Editor / app.py
mike23415's picture
Update app.py
c770f0d verified
import ast
import io
import json

import fitz # PyMuPDF
import pytesseract
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from pdf2image import convert_from_bytes
# Flask application object; the @app.route decorators below register the
# /upload and /rebuild handlers on it.
app = Flask(__name__)
# Wrap the app with flask_cors; no options are passed, so the extension's
# defaults apply.
CORS(app)
@app.route('/upload', methods=['POST'])
def upload_pdf():
    """Extract embedded text and OCR text from an uploaded PDF.

    Expects a multipart form upload carrying the document in the ``pdf``
    file field.

    Returns:
        A JSON response with two lists:

        * ``digital`` -- one entry per page of the document,
          ``{"page": <1-based page number>, "digital_text": [{"text": ...}]}``,
          where each ``text`` joins the spans of one PyMuPDF text block with
          single spaces.
        * ``ocr`` -- one ``{"ocr_text": ...}`` entry per image returned by
          ``convert_from_bytes``, holding the Tesseract output for it.

        An HTTP 400 response with an ``error`` message is returned when the
        ``pdf`` field is missing, the upload is empty, or PyMuPDF cannot
        open the data.
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'No PDF uploaded'}), 400

    pdf_bytes = request.files['pdf'].read()
    if not pdf_bytes:
        return jsonify({'error': 'Uploaded PDF is empty'}), 400

    # Digital text extraction.
    # PyMuPDF's open errors derive from RuntimeError (ValueError in some
    # older releases); report them as a client error instead of a 500.
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except (RuntimeError, ValueError):
        return jsonify({'error': 'Uploaded file is not a readable PDF'}), 400

    digital = []
    # The context manager closes the document even if extraction fails.
    with doc:
        for page in doc:
            # Blocks without a "lines" key carry no text and are skipped.
            text_blocks = [
                {
                    "text": " ".join(
                        span["text"]
                        for line in block["lines"]
                        for span in line["spans"]
                    )
                }
                for block in page.get_text("dict")["blocks"]
                if "lines" in block
            ]
            digital.append({
                "page": page.number + 1,
                "digital_text": text_blocks,
            })

    # OCR extraction: rasterize the same bytes and run Tesseract per image.
    ocr = [
        {"ocr_text": pytesseract.image_to_string(image)}
        for image in convert_from_bytes(pdf_bytes)
    ]

    return jsonify({
        "digital": digital,
        "ocr": ocr,
    })
def _parse_edits(raw):
    """Decode the ``edits`` form field without executing it.

    SECURITY: this replaces a bare ``eval()`` on request data, which let any
    client run arbitrary Python in the server process. JSON is tried first;
    ``ast.literal_eval`` is the fallback so payloads written as Python
    literals (e.g. single-quoted strings) are still accepted. Neither
    decoder evaluates expressions or calls.

    Raises:
        ValueError: if ``raw`` is neither valid JSON nor a Python literal.
    """
    for decode in (json.loads, ast.literal_eval):
        try:
            return decode(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
    raise ValueError('edits is neither valid JSON nor a Python literal')


@app.route('/rebuild', methods=['POST'])
def rebuild_pdf():
    """Stamp edited text onto an uploaded PDF and return the result.

    Expects a multipart form upload with:

    * ``pdf`` (file field) -- the document to modify.
    * ``edits`` (form field) -- a list of per-page objects. Each object has
      a 1-based ``pageNumber`` and may have ``digitalText`` (a list of
      objects whose ``text`` is written one line per block, black, starting
      at y=100 and stepping 20 points) and ``ocrText`` (a string written in
      red below the last digital block).

    Returns:
        The modified PDF as an attachment named ``edited_output.pdf``, or an
        HTTP 400 JSON response with an ``error`` message when a field is
        missing, ``edits`` cannot be decoded or has the wrong shape, a
        ``pageNumber`` is missing or outside the document, or the PDF
        cannot be opened.
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'PDF is required'}), 400
    if 'edits' not in request.form:
        return jsonify({'error': 'Edits are required'}), 400

    try:
        edits = _parse_edits(request.form['edits'])
    except ValueError:
        return jsonify({'error': 'Edits must be valid JSON'}), 400
    if not isinstance(edits, (list, tuple)):
        return jsonify({'error': 'Edits must be a list of page edits'}), 400

    pdf_bytes = request.files['pdf'].read()
    if not pdf_bytes:
        return jsonify({'error': 'Uploaded PDF is empty'}), 400
    # PyMuPDF's open errors derive from RuntimeError (ValueError in some
    # older releases); report them as a client error instead of a 500.
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except (RuntimeError, ValueError):
        return jsonify({'error': 'Uploaded file is not a readable PDF'}), 400

    output_pdf = io.BytesIO()
    # The context manager closes the document on every exit path below.
    with doc:
        page_count = len(doc)
        for page_data in edits:
            if not isinstance(page_data, dict):
                return jsonify({'error': 'Each edit must be an object'}), 400

            page_num = page_data.get('pageNumber')
            # Reject bools, non-integers and out-of-range values up front;
            # otherwise 0 or negatives would silently index from the end.
            if (
                not isinstance(page_num, int)
                or isinstance(page_num, bool)
                or not 1 <= page_num <= page_count
            ):
                return jsonify({'error': f'Invalid pageNumber: {page_num!r}'}), 400

            page = doc[page_num - 1]
            y = 100
            for block in page_data.get('digitalText', []):
                page.insert_text((50, y), block.get('text', ''), fontsize=12)
                y += 20

            ocr_text = page_data.get('ocrText', '')
            if ocr_text:
                # OCR text goes one extra line below the digital blocks, in red.
                page.insert_text((50, y + 20), ocr_text, fontsize=12, color=(1, 0, 0))

        # Serialize while the document is still open.
        doc.save(output_pdf)

    output_pdf.seek(0)
    return send_file(output_pdf, as_attachment=True, download_name='edited_output.pdf', mimetype='application/pdf')
if __name__ == "__main__":
    # Runs only when the file is executed directly: start Flask's built-in
    # server bound to all interfaces on port 7860.
    app.run(port=7860, host="0.0.0.0")