"""
@app.route('/')
def index():
    """Serve the landing page by rendering the module-level ``HTML_TEMPLATE``."""
    page = render_template_string(HTML_TEMPLATE)
    return page
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept an uploaded PDF and answer with its HTML rendering as JSON.

    The PDF is read from the multipart field ``file``. Every outcome is
    reported as a JSON object: ``{'success': True, 'html': ...}`` when the
    conversion ran, ``{'success': False, 'error': ...}`` when the request was
    rejected or an exception was raised while handling it.
    """

    def reject(error, log_line=None):
        # Optionally log the reason to the console, then build the failure payload.
        if log_line is not None:
            print(log_line)
        return jsonify({'success': False, 'error': error})

    try:
        print("\n" + "="*60)
        print("收到上传请求")

        if 'file' not in request.files:
            return reject('没有文件上传', "错误: 没有文件上传")

        uploaded = request.files['file']
        print(f"文件名: {uploaded.filename}")

        if uploaded.filename == '':
            return reject('文件名为空', "错误: 文件名为空")

        is_pdf = uploaded.filename.lower().endswith('.pdf')
        if not is_pdf:
            return reject('只支持PDF文件', "错误: 不是PDF文件")

        if not PDF_LIBRARY:
            return reject('未安装PDF处理库,请安装: pip install PyPDF2')

        # Read the whole upload into memory and convert it.
        payload = uploaded.read()
        print(f"PDF大小: {len(payload)} 字节")

        rendered = pdf_to_html(payload)
        print(f"转换成功,HTML长度: {len(rendered)} 字符")
        return jsonify({'success': True, 'html': rendered})
    except Exception as exc:
        summary = f"{type(exc).__name__}: {str(exc)}"
        print(f"错误详情:\n{traceback.format_exc()}")
        return jsonify({'success': False, 'error': summary})
@app.route('/export', methods=['POST'])
def export_pdf():
    """Render the HTML posted by the client into a downloadable PDF.

    Expects a JSON object body with an ``html`` string field. Text runs are
    extracted from the HTML; runs inside ``h1``/``h2``/``h3`` are set in a
    heading style, everything else in the body style.

    Returns:
        The generated PDF as an attachment on success. On failure a JSON
        object ``{'success': False, 'error': ...}`` with status 400 (malformed
        request body) or 500 (reportlab unavailable or any other exception).
    """
    # Local import: only names the original block already relied on are
    # assumed to exist at module level.
    from html import escape

    try:
        print("\n" + "="*60)
        print("收到导出请求")

        if not PDF_EXPORT:
            return jsonify({'success': False, 'error': '未安装PDF导出库,请安装: pip install reportlab'}), 500

        # silent=True makes an absent or unparsable JSON body come back as None
        # so that it is reported as a client error instead of crashing below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'success': False, 'error': '请求体必须是JSON对象'}), 400
        html_content = data.get('html', '')
        if not isinstance(html_content, str):
            return jsonify({'success': False, 'error': 'html字段必须是字符串'}), 400
        print(f"HTML内容长度: {len(html_content)} 字符")

        # Build the PDF in memory with reportlab.
        pdf_buffer = io.BytesIO()
        doc = SimpleDocTemplate(pdf_buffer, pagesize=A4)

        font_name = _register_export_font()

        styles = getSampleStyleSheet()
        custom_style = ParagraphStyle(
            'CustomStyle',
            parent=styles['Normal'],
            fontName=font_name,
            fontSize=12,
            leading=18,
            alignment=TA_LEFT,
        )
        # Heading styles are loop-invariant, so build them once per request.
        heading_styles = {
            tag: ParagraphStyle(
                'TitleStyle',
                parent=custom_style,
                fontSize=16 if tag == 'h2' else 14,
                textColor='#333333',
                spaceAfter=12,
            )
            for tag in ('h1', 'h2', 'h3')
        }

        story = []
        for tag, text in _extract_tagged_text(html_content):
            # Paragraph parses its input as XML-like markup, while the parser
            # hands us entity-decoded text; re-escape &, < and > so literal
            # characters in the document cannot break or alter the markup.
            markup = escape(text, quote=False)
            story.append(Paragraph(markup, heading_styles.get(tag, custom_style)))
            story.append(Spacer(1, 0.1*inch))

        doc.build(story)
        pdf_buffer.seek(0)
        print("✓ PDF生成成功")

        return send_file(
            pdf_buffer,
            mimetype='application/pdf',
            as_attachment=True,
            download_name=f'edited_{datetime.now().strftime("%Y%m%d_%H%M%S")}.pdf'
        )
    except Exception as e:
        error_msg = f"{type(e).__name__}: {str(e)}"
        print(f"错误详情:\n{traceback.format_exc()}")
        return jsonify({'success': False, 'error': error_msg}), 500


def _register_export_font():
    """Register the first loadable Chinese system font and return the font name to use.

    Falls back to ``'Helvetica'`` when none of the candidate files exists or
    can be registered.
    """
    # Windows system font locations.
    font_paths = [
        'C:/Windows/Fonts/simhei.ttf',  # SimHei
        'C:/Windows/Fonts/simsun.ttc',  # SimSun
        'C:/Windows/Fonts/msyh.ttc',  # Microsoft YaHei
    ]
    for font_path in font_paths:
        if not os.path.exists(font_path):
            continue
        try:
            pdfmetrics.registerFont(TTFont('ChineseFont', font_path))
        except Exception as e:
            # One unreadable font file should not stop us trying the others.
            print(f"⚠ 字体加载失败: {e}")
            continue
        print(f"✓ 成功加载字体: {font_path}")
        return 'ChineseFont'
    print("⚠ 未找到中文字体,使用默认字体")
    return 'Helvetica'


def _extract_tagged_text(html_content):
    """Return ``(tag, text)`` pairs for every non-blank text run in ``html_content``.

    ``tag`` is the most recently opened tag, or ``None`` when the text follows
    a closing tag or precedes any tag.
    """
    from html.parser import HTMLParser

    class TextExtractor(HTMLParser):
        def __init__(self):
            super().__init__()
            self.text_parts = []
            self.current_tag = None

        def handle_starttag(self, tag, attrs):
            self.current_tag = tag

        def handle_data(self, data):
            text = data.strip()
            if text:
                self.text_parts.append((self.current_tag, text))

        def handle_endtag(self, tag):
            self.current_tag = None

    extractor = TextExtractor()
    extractor.feed(html_content)
    # close() flushes text the parser may still be buffering at end of input
    # (e.g. a trailing run containing a bare '&').
    extractor.close()
    return extractor.text_parts
def pdf_to_html(pdf_bytes):
    """Convert the raw bytes of a PDF into an HTML fragment.

    The text of each page is extracted with the backend named by
    ``PDF_LIBRARY`` (``'pypdf2'`` or ``'pdfplumber'``). Every page with text
    becomes an ``<h2>`` page header followed by one ``<p>`` per non-blank
    line, with the line text HTML-escaped.

    Args:
        pdf_bytes: Content of the PDF file.

    Returns:
        The HTML fragment as a string. Problems are reported in-band rather
        than raised: a ``<p>`` message is returned when no backend is
        configured, when no text could be extracted, or when parsing raised.
    """
    html_parts = []
    try:
        if PDF_LIBRARY == 'pypdf2':
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            total_pages = len(pdf_reader.pages)
            print(f"PDF总页数: {total_pages}")
            for page_num, page in enumerate(pdf_reader.pages, 1):
                html_parts.extend(_page_text_to_html(page_num, page.extract_text()))
        elif PDF_LIBRARY == 'pdfplumber':
            import pdfplumber
            with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
                total_pages = len(pdf.pages)
                print(f"PDF总页数: {total_pages}")
                for page_num, page in enumerate(pdf.pages, 1):
                    html_parts.extend(_page_text_to_html(page_num, page.extract_text()))
        else:
            return '<p>错误: 未安装PDF处理库</p>'

        if not html_parts:
            return '<p>⚠ 未能提取到文本内容。 PDF可能是扫描版或纯图片格式。 建议使用OCR工具转换后再试。</p>'
        return '\n'.join(html_parts)
    except Exception as e:
        from html import escape

        error_msg = f"PDF解析错误: {type(e).__name__}: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        # The exception text is arbitrary, so escape it before embedding it
        # in markup that the client will render.
        return f'<p>{escape(error_msg)}</p>'


def _page_text_to_html(page_num, text):
    """Render one page's extracted text as a list of HTML fragments.

    Returns an empty list when ``text`` is empty or ``None``; otherwise the
    page header, one escaped paragraph per non-blank line, and a trailing
    empty string that leaves a blank line between pages once joined.
    """
    from html import escape

    if not text:
        return []
    parts = [f'<h2>第 {page_num} 页</h2>']
    stripped_lines = (raw.strip() for raw in text.split('\n'))
    # Escape &, <, > and quotes so PDF text can never be read as markup.
    parts.extend(f'<p>{escape(line, quote=True)}</p>' for line in stripped_lines if line)
    parts.append('')
    return parts
if __name__ == '__main__':
    # Script entry point: print a start-up banner, report which optional PDF
    # libraries are available, then start the Flask development server.
    print("\n" + "="*60)
    print("📄 PDF转可编辑网页工具")
    print("="*60)

    # Dependency check
    print("\n依赖检查:")
    if PDF_LIBRARY:
        print(f" ✓ PDF读取: {PDF_LIBRARY}")
    else:
        print(" ✗ PDF读取库未安装")
        print(" 请运行: pip install PyPDF2")

    if PDF_EXPORT:
        print(f" ✓ PDF导出: {PDF_EXPORT}")
    else:
        print(" ✗ PDF导出库未安装")
        print(" 请运行: pip install reportlab")

    if not PDF_LIBRARY or not PDF_EXPORT:
        print("\n⚠ 警告: 缺少必要的依赖库")
        print("请先安装所需的库,然后重新运行程序\n")

    print("\n服务器信息:")
    print(" 地址: http://localhost:5000")
    print(" 或者: http://127.0.0.1:5000")
    print("\n按 Ctrl+C 停止服务器")
    print("="*60 + "\n")

    # The interactive debugger can execute arbitrary Python from the browser,
    # so it must never be combined with a bind on all interfaces. Debug mode
    # is therefore opt-in through FLASK_DEBUG and, when enabled, the server
    # listens on the loopback interface only.
    debug_setting = os.environ.get('FLASK_DEBUG', '')
    debug_enabled = bool(debug_setting) and debug_setting.lower() not in ('0', 'false', 'no')
    bind_host = '127.0.0.1' if debug_enabled else '0.0.0.0'
    app.run(debug=debug_enabled, host=bind_host, port=5000)