"""PDF-to-editable-web-page tool.

Upload a PDF, edit the extracted text in the browser, and export the result
as a new PDF.

Usage:
    1. Install dependencies: pip install flask PyPDF2 reportlab
    2. Run the script:       python pdf_editor.py
    3. Open in a browser:    http://localhost:5000
"""

import html
import io
import os
import traceback
from datetime import datetime
from html.parser import HTMLParser
from xml.sax.saxutils import escape as xml_escape

from flask import Flask, render_template_string, request, send_file, jsonify

# Pick a PDF text-extraction backend: PyPDF2 first, pdfplumber as a fallback.
try:
    import PyPDF2
    PDF_LIBRARY = 'pypdf2'
    print("✓ 使用 PyPDF2 库")
except ImportError:
    print("✗ PyPDF2 未安装,尝试使用 pdfplumber")
    try:
        import pdfplumber
        PDF_LIBRARY = 'pdfplumber'
        print("✓ 使用 pdfplumber 库")
    except ImportError:
        print("✗ 警告: 没有安装PDF处理库")
        PDF_LIBRARY = None

# PDF generation backend (optional; /export reports an error without it).
try:
    from reportlab.lib.pagesizes import letter, A4
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import inch
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
    from reportlab.pdfbase import pdfmetrics
    from reportlab.pdfbase.ttfonts import TTFont
    from reportlab.lib.enums import TA_LEFT
    PDF_EXPORT = 'reportlab'
    print("✓ 使用 reportlab 导出PDF")
except ImportError:
    print("✗ reportlab 未安装")
    PDF_EXPORT = None

app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MB upload limit

# Font name under which a CJK-capable TrueType font is registered, the
# built-in fallback, and the candidate font files (Windows locations).
_CHINESE_FONT_NAME = 'ChineseFont'
_FALLBACK_FONT_NAME = 'Helvetica'
_CHINESE_FONT_PATHS = (
    'C:/Windows/Fonts/simhei.ttf',  # SimHei
    'C:/Windows/Fonts/simsun.ttc',  # SimSun
    'C:/Windows/Fonts/msyh.ttc',    # Microsoft YaHei
)

# Tags whose text is rendered with the heading style on export.
_HEADING_TAGS = ('h1', 'h2', 'h3')

# NOTE(review): the markup, CSS and JavaScript of this template were not
# present in the reviewed source (only its visible text was). This is a
# minimal functional page built around those strings and the /upload and
# /export contracts below -- replace it with the real template if available.
# The template is rendered by Jinja, so it must not contain "{{", "{%" or "{#".
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>PDF编辑器</title>
<style>
  body { font-family: sans-serif; max-width: 900px; margin: 2em auto; padding: 0 1em; }
  #drop-zone { border: 2px dashed #888; padding: 2em; text-align: center; cursor: pointer; }
  #drop-zone.dragover { background: #eef; }
  #loading, #editor-section { display: none; }
  #editor { border: 1px solid #ccc; padding: 1em; min-height: 300px; }
</style>
</head>
<body>
<h1>📄 PDF编辑器</h1>
<p>上传PDF文件,在线编辑,一键导出</p>
<div id="drop-zone">
  <div>📁</div>
  <h3>拖放PDF文件到此处</h3>
  <p>支持最大16MB的PDF文件</p>
  <input type="file" id="file-input" accept=".pdf,application/pdf" hidden>
</div>
<div id="loading">正在处理PDF文件,请稍候...</div>
<div id="editor-section">
  <h2>编辑内容</h2>
  <div id="editor" contenteditable="true"></div>
  <button id="export-btn" type="button">导出为PDF</button>
</div>
<script>
  const dropZone = document.getElementById('drop-zone');
  const fileInput = document.getElementById('file-input');
  const loading = document.getElementById('loading');
  const editorSection = document.getElementById('editor-section');
  const editor = document.getElementById('editor');

  async function uploadFile(file) {
    if (!file) { return; }
    const form = new FormData();
    form.append('file', file);
    loading.style.display = 'block';
    try {
      const resp = await fetch('/upload', { method: 'POST', body: form });
      const result = await resp.json();
      if (result.success) {
        editor.innerHTML = result.html;
        editorSection.style.display = 'block';
      } else {
        alert(result.error);
      }
    } catch (err) {
      alert(String(err));
    } finally {
      loading.style.display = 'none';
    }
  }

  dropZone.addEventListener('click', () => fileInput.click());
  fileInput.addEventListener('change', () => uploadFile(fileInput.files[0]));
  dropZone.addEventListener('dragover', (event) => {
    event.preventDefault();
    dropZone.classList.add('dragover');
  });
  dropZone.addEventListener('dragleave', () => dropZone.classList.remove('dragover'));
  dropZone.addEventListener('drop', (event) => {
    event.preventDefault();
    dropZone.classList.remove('dragover');
    uploadFile(event.dataTransfer.files[0]);
  });

  document.getElementById('export-btn').addEventListener('click', async () => {
    try {
      const resp = await fetch('/export', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ html: editor.innerHTML })
      });
      if (!resp.ok) {
        const result = await resp.json();
        alert(result.error);
        return;
      }
      const url = URL.createObjectURL(await resp.blob());
      const link = document.createElement('a');
      link.href = url;
      link.download = 'edited.pdf';
      link.click();
      URL.revokeObjectURL(url);
    } catch (err) {
      alert(String(err));
    }
  });
</script>
</body>
</html>
"""


class _TextExtractor(HTMLParser):
    """Collect ``(innermost_open_tag, text)`` pairs from an HTML fragment.

    Whitespace-only text is skipped. After any end tag the current tag is
    reset to ``None``, so text that follows a closed child element is
    reported with tag ``None`` and rendered as a normal paragraph.
    """

    def __init__(self):
        super().__init__()
        self.text_parts = []
        self.current_tag = None

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag

    def handle_data(self, data):
        text = data.strip()
        if text:
            self.text_parts.append((self.current_tag, text))

    def handle_endtag(self, tag):
        self.current_tag = None


def _register_chinese_font():
    """Return the font name to use for exported text.

    Registers the first usable font from ``_CHINESE_FONT_PATHS`` under
    ``_CHINESE_FONT_NAME``; falls back to Helvetica when none can be loaded.
    """
    # Registration is process-wide; do not repeat it on every export.
    if _CHINESE_FONT_NAME in pdfmetrics.getRegisteredFontNames():
        return _CHINESE_FONT_NAME

    for font_path in _CHINESE_FONT_PATHS:
        if not os.path.exists(font_path):
            continue
        try:
            pdfmetrics.registerFont(TTFont(_CHINESE_FONT_NAME, font_path))
        except Exception as e:
            # Best effort: one unreadable font must not stop us from trying
            # the remaining candidates.
            print(f"⚠ 字体加载失败: {e}")
            continue
        print(f"✓ 成功加载字体: {font_path}")
        return _CHINESE_FONT_NAME

    print("⚠ 未找到中文字体,使用默认字体")
    return _FALLBACK_FONT_NAME


def _build_story(html_content, font_name):
    """Turn edited HTML into a list of reportlab flowables."""
    styles = getSampleStyleSheet()
    body_style = ParagraphStyle(
        'CustomStyle',
        parent=styles['Normal'],
        fontName=font_name,
        fontSize=12,
        leading=18,
        alignment=TA_LEFT,
    )
    # One style per heading tag, built once instead of once per heading.
    heading_styles = {
        tag: ParagraphStyle(
            'TitleStyle',
            parent=body_style,
            fontSize=16 if tag == 'h2' else 14,
            textColor='#333333',
            spaceAfter=12,
        )
        for tag in _HEADING_TAGS
    }

    extractor = _TextExtractor()
    extractor.feed(html_content)
    extractor.close()  # flush any text the parser is still buffering

    story = []
    for tag, text in extractor.text_parts:
        style = heading_styles.get(tag, body_style)
        # Paragraph parses its text as XML-like markup, so literal '&', '<'
        # and '>' coming from the user must be escaped first.
        story.append(Paragraph(xml_escape(text), style))
        story.append(Spacer(1, 0.1 * inch))
    return story


def _iter_page_texts(pdf_bytes):
    """Yield the extracted text of each page using the available backend."""
    stream = io.BytesIO(pdf_bytes)
    if PDF_LIBRARY == 'pypdf2':
        pdf_reader = PyPDF2.PdfReader(stream)
        print(f"PDF总页数: {len(pdf_reader.pages)}")
        for page in pdf_reader.pages:
            yield page.extract_text()
    elif PDF_LIBRARY == 'pdfplumber':
        with pdfplumber.open(stream) as pdf:
            print(f"PDF总页数: {len(pdf.pages)}")
            for page in pdf.pages:
                yield page.extract_text()


def _page_to_html(page_number, text):
    """Return the HTML fragments for one page: a heading plus one <p> per line."""
    fragments = [f'<div class="page"><h2>第 {page_number} 页</h2>']
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line:
            # PDF text is untrusted and ends up in the browser via innerHTML.
            fragments.append(f'<p>{html.escape(line)}</p>')
    fragments.append('</div>')
    return fragments


@app.route('/')
def index():
    """Serve the single-page editor UI."""
    return render_template_string(HTML_TEMPLATE)


@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept an uploaded PDF and return its text as editable HTML.

    Always responds with JSON: ``{'success': True, 'html': ...}`` on success,
    ``{'success': False, 'error': ...}`` otherwise.
    """
    try:
        print("\n" + "=" * 60)
        print("收到上传请求")

        if 'file' not in request.files:
            print("错误: 没有文件上传")
            return jsonify({'success': False, 'error': '没有文件上传'})

        file = request.files['file']
        print(f"文件名: {file.filename}")

        # filename may be None as well as '', so test truthiness.
        if not file.filename:
            print("错误: 文件名为空")
            return jsonify({'success': False, 'error': '文件名为空'})

        if not file.filename.lower().endswith('.pdf'):
            print("错误: 不是PDF文件")
            return jsonify({'success': False, 'error': '只支持PDF文件'})

        if not PDF_LIBRARY:
            return jsonify({'success': False, 'error': '未安装PDF处理库,请安装: pip install PyPDF2'})

        pdf_bytes = file.read()
        print(f"PDF大小: {len(pdf_bytes)} 字节")

        html_content = pdf_to_html(pdf_bytes)
        print(f"转换成功,HTML长度: {len(html_content)} 字符")

        return jsonify({'success': True, 'html': html_content})

    except Exception as e:
        # Top-level request boundary: log the traceback, report as JSON.
        error_msg = f"{type(e).__name__}: {str(e)}"
        print(f"错误详情:\n{traceback.format_exc()}")
        return jsonify({'success': False, 'error': error_msg})


@app.route('/export', methods=['POST'])
def export_pdf():
    """Render the posted HTML (JSON body ``{'html': ...}``) to a PDF download.

    Returns the PDF as an attachment, or a JSON error with status 400/500.
    """
    try:
        print("\n" + "=" * 60)
        print("收到导出请求")

        if not PDF_EXPORT:
            return jsonify({'success': False, 'error': '未安装PDF导出库,请安装: pip install reportlab'}), 500

        # A missing, malformed or non-object JSON body is treated as empty
        # instead of crashing on `.get`.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        html_content = data.get('html', '')
        if not isinstance(html_content, str):
            return jsonify({'success': False, 'error': "'html' must be a string"}), 400
        print(f"HTML内容长度: {len(html_content)} 字符")

        pdf_buffer = io.BytesIO()
        doc = SimpleDocTemplate(pdf_buffer, pagesize=A4)

        font_name = _register_chinese_font()
        story = _build_story(html_content, font_name)

        doc.build(story)
        pdf_buffer.seek(0)

        print("✓ PDF生成成功")

        return send_file(
            pdf_buffer,
            mimetype='application/pdf',
            as_attachment=True,
            download_name=f'edited_{datetime.now().strftime("%Y%m%d_%H%M%S")}.pdf'
        )

    except Exception as e:
        # Top-level request boundary: log the traceback, report as JSON.
        error_msg = f"{type(e).__name__}: {str(e)}"
        print(f"错误详情:\n{traceback.format_exc()}")
        return jsonify({'success': False, 'error': error_msg}), 500


def pdf_to_html(pdf_bytes):
    """Convert PDF bytes to an HTML fragment.

    Each page with extractable text becomes a ``<div class="page">`` holding
    an ``<h2>`` page heading and one ``<p>`` per non-empty line. All text is
    HTML-escaped. Never raises: parse failures, a missing backend, and PDFs
    without extractable text are reported as an HTML message instead.
    """
    if PDF_LIBRARY not in ('pypdf2', 'pdfplumber'):
        return '<p>错误: 未安装PDF处理库</p>'

    html_parts = []
    try:
        for page_number, text in enumerate(_iter_page_texts(pdf_bytes), 1):
            if text:
                html_parts.extend(_page_to_html(page_number, text))
    except Exception as e:
        error_msg = f"PDF解析错误: {type(e).__name__}: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        # The exception text may contain characters from the PDF itself.
        return f'<p>{html.escape(error_msg)}</p>'

    if not html_parts:
        return (
            '<p>⚠ 未能提取到文本内容。<br>'
            'PDF可能是扫描版或纯图片格式。<br>'
            '建议使用OCR工具转换后再试。</p>'
        )

    return '\n'.join(html_parts)


def main():
    """Print a dependency/status banner and start the development server."""
    print("\n" + "=" * 60)
    print("📄 PDF转可编辑网页工具")
    print("=" * 60)

    print("\n依赖检查:")
    if PDF_LIBRARY:
        print(f" ✓ PDF读取: {PDF_LIBRARY}")
    else:
        print(" ✗ PDF读取库未安装")
        print(" 请运行: pip install PyPDF2")

    if PDF_EXPORT:
        print(f" ✓ PDF导出: {PDF_EXPORT}")
    else:
        print(" ✗ PDF导出库未安装")
        print(" 请运行: pip install reportlab")

    if not PDF_LIBRARY or not PDF_EXPORT:
        print("\n⚠ 警告: 缺少必要的依赖库")
        print("请先安装所需的库,然后重新运行程序\n")

    print("\n服务器信息:")
    print(" 地址: http://localhost:5000")
    print(" 或者: http://127.0.0.1:5000")
    print("\n按 Ctrl+C 停止服务器")
    print("=" * 60 + "\n")

    # The Werkzeug debugger allows arbitrary code execution, and this server
    # listens on all interfaces, so debug mode is opt-in (FLASK_DEBUG=1)
    # rather than always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)


if __name__ == '__main__':
    main()