"""Flask service that extracts text from uploaded PDFs and writes edits back.

Endpoints:
    POST /upload   -- return the embedded ("digital") text blocks and the
                      OCR text of every page of the uploaded PDF.
    POST /rebuild  -- stamp edited text onto the uploaded PDF and return the
                      resulting file as a download.
"""

import ast
import io
import json

import fitz  # PyMuPDF
import pytesseract
from flask import Flask, jsonify, request, send_file
from flask_cors import CORS
from pdf2image import convert_from_bytes

app = Flask(__name__)
CORS(app)

# Layout used when stamping edited text onto a page (page coordinates).
TEXT_LEFT_MARGIN = 50
TEXT_TOP_START = 100
TEXT_LINE_STEP = 20
TEXT_FONT_SIZE = 12
OCR_TEXT_COLOR = (1, 0, 0)  # RGB components in the 0..1 range


class _EditError(ValueError):
    """Raised when the client-supplied ``edits`` payload is unusable."""


def _open_pdf(pdf_bytes):
    """Open *pdf_bytes* with PyMuPDF.

    Returns:
        The opened document, or ``None`` when the upload is empty or cannot
        be parsed as a PDF, so callers can answer with a 400 instead of a 500.
    """
    if not pdf_bytes:
        return None
    try:
        return fitz.open(stream=pdf_bytes, filetype="pdf")
    except (RuntimeError, ValueError):
        # NOTE(review): PyMuPDF's data errors derive from RuntimeError in
        # current releases; ValueError is caught as well for older ones.
        # Confirm against the pinned PyMuPDF version.
        return None


def _extract_digital_text(doc):
    """Collect the embedded text of every page of *doc*.

    Returns:
        One ``{"page": ..., "digital_text": [{"text": ...}, ...]}`` dict per
        page, where ``page`` is ``page.number + 1`` and each entry of
        ``digital_text`` is the space-joined span text of one text block.
    """
    pages = []
    for page in doc:
        text_blocks = [
            {
                "text": " ".join(
                    span["text"]
                    for line in block["lines"]
                    for span in line["spans"]
                )
            }
            for block in page.get_text("dict")["blocks"]
            if "lines" in block  # blocks without "lines" carry no text
        ]
        pages.append({"page": page.number + 1, "digital_text": text_blocks})
    return pages


def _extract_ocr_text(pdf_bytes):
    """Rasterize *pdf_bytes* and OCR each resulting image with Tesseract.

    Returns:
        One ``{"ocr_text": ...}`` dict per rendered image, in order.
    """
    return [
        {"ocr_text": pytesseract.image_to_string(image)}
        for image in convert_from_bytes(pdf_bytes)
    ]


def _parse_edits(raw):
    """Decode and validate the ``edits`` form field.

    SECURITY: this field used to be passed to ``eval``, which executes
    arbitrary code sent by the client. It is now decoded as JSON; payloads
    that are not valid JSON fall back to ``ast.literal_eval`` (literals only,
    no code execution) so Python-literal payloads that worked before still do.

    Args:
        raw: The raw string submitted by the client.

    Returns:
        A list (or tuple) of per-page dicts, each with an integer
        ``pageNumber``.

    Raises:
        _EditError: If the payload cannot be decoded or has the wrong shape.
    """
    try:
        edits = json.loads(raw)
    except ValueError:
        try:
            edits = ast.literal_eval(raw)
        except (
            ValueError,
            SyntaxError,
            TypeError,
            MemoryError,
            RecursionError,
        ) as err:
            raise _EditError("Edits must be valid JSON") from err

    if not isinstance(edits, (list, tuple)):
        raise _EditError("Edits must be a list of page objects")

    for page_data in edits:
        if not isinstance(page_data, dict):
            raise _EditError("Each edit must be an object")
        page_num = page_data.get("pageNumber")
        # bool is a subclass of int; reject it explicitly.
        if isinstance(page_num, bool) or not isinstance(page_num, int):
            raise _EditError("Each edit requires an integer pageNumber")
        blocks = page_data.get("digitalText") or []
        if not isinstance(blocks, (list, tuple)) or not all(
            isinstance(block, dict) for block in blocks
        ):
            raise _EditError("digitalText must be a list of objects")
    return edits


def _apply_edits(doc, edits):
    """Write the text described by *edits* onto the pages of *doc* in place.

    Digital blocks are written top-down starting at ``TEXT_TOP_START``; the
    OCR text, when present, follows one extra line step below them in
    ``OCR_TEXT_COLOR``.

    Raises:
        _EditError: If a ``pageNumber`` does not refer to a page of *doc*.
    """
    page_count = len(doc)
    for page_data in edits:
        page_num = page_data["pageNumber"]
        # Without this check, 0 would become index -1 and silently edit the
        # last page, and values past the end would surface as a 500.
        if not 1 <= page_num <= page_count:
            raise _EditError(
                f"pageNumber {page_num} is out of range (1-{page_count})"
            )
        page = doc[page_num - 1]

        y = TEXT_TOP_START
        for block in page_data.get("digitalText") or []:
            page.insert_text(
                (TEXT_LEFT_MARGIN, y),
                block.get("text", ""),
                fontsize=TEXT_FONT_SIZE,
            )
            y += TEXT_LINE_STEP

        ocr_text = page_data.get("ocrText") or ""
        if ocr_text:
            page.insert_text(
                (TEXT_LEFT_MARGIN, y + TEXT_LINE_STEP),
                ocr_text,
                fontsize=TEXT_FONT_SIZE,
                color=OCR_TEXT_COLOR,
            )


@app.route('/upload', methods=['POST'])
def upload_pdf():
    """Return the digital and OCR text of the PDF uploaded as ``pdf``.

    Responds with ``{"digital": [...], "ocr": [...]}`` on success, or a 400
    JSON error when no file was sent or the file cannot be read as a PDF.
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'No PDF uploaded'}), 400

    pdf_bytes = request.files['pdf'].read()

    # Digital text extraction
    doc = _open_pdf(pdf_bytes)
    if doc is None:
        return jsonify({'error': 'Uploaded file is not a readable PDF'}), 400
    try:
        digital = _extract_digital_text(doc)
    finally:
        doc.close()  # release the document even if extraction fails

    # OCR extraction
    ocr = _extract_ocr_text(pdf_bytes)

    return jsonify({"digital": digital, "ocr": ocr})


@app.route('/rebuild', methods=['POST'])
def rebuild_pdf():
    """Stamp the submitted ``edits`` onto the uploaded ``pdf`` and return it.

    Expects a multipart form with a ``pdf`` file and an ``edits`` field
    holding a JSON list of ``{"pageNumber", "digitalText", "ocrText"}``
    objects. Responds with the edited PDF as an attachment, or a 400 JSON
    error when the input is missing or invalid.
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'PDF is required'}), 400
    if 'edits' not in request.form:
        return jsonify({'error': 'Edits are required'}), 400

    try:
        edits = _parse_edits(request.form['edits'])
    except _EditError as err:
        return jsonify({'error': str(err)}), 400

    doc = _open_pdf(request.files['pdf'].read())
    if doc is None:
        return jsonify({'error': 'Uploaded file is not a readable PDF'}), 400

    output_pdf = io.BytesIO()
    try:
        _apply_edits(doc, edits)
        doc.save(output_pdf)
    except _EditError as err:
        return jsonify({'error': str(err)}), 400
    finally:
        doc.close()  # release the document on every path

    output_pdf.seek(0)
    return send_file(
        output_pdf,
        as_attachment=True,
        download_name='edited_output.pdf',
        mimetype='application/pdf',
    )


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=7860)