import ast
import io
import json

import fitz
import pytesseract
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from pdf2image import convert_from_bytes
# WSGI application object; the route decorators in this module register on it.
app = Flask(__name__)

# Enable CORS handling for the whole app.
# NOTE(review): called with no options, so flask-cors applies its default
# policy (believed to allow all origins) -- confirm this is intended, or pass
# an explicit `origins=` list.
CORS(app)
@app.route('/upload', methods=['POST'])
def upload_pdf():
    """Extract embedded and OCR text from an uploaded PDF.

    Expects a POST request carrying the PDF in the ``pdf`` file field.

    Returns:
        A JSON response with two lists:

        * ``digital`` -- one entry per page of the form
          ``{"page": <1-based page number>, "digital_text": [{"text": ...}]}``,
          built from the text blocks PyMuPDF reports for that page.
        * ``ocr`` -- one ``{"ocr_text": ...}`` entry per image produced by
          ``convert_from_bytes``, in the order that function returns them.

        Responds with HTTP 400 and an ``error`` message when the ``pdf``
        field is missing or the upload cannot be opened as a PDF.
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'No PDF uploaded'}), 400

    pdf_bytes = request.files['pdf'].read()

    # Only the open call is guarded: a corrupt or empty upload is a client
    # error, not a server failure.
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except (RuntimeError, ValueError):
        return jsonify({'error': 'Uploaded file is not a readable PDF'}), 400

    digital = []
    try:
        for page in doc:
            blocks = page.get_text("dict")["blocks"]
            # Blocks without a "lines" key carry no text spans and are
            # skipped (presumably image blocks -- verify against PyMuPDF docs).
            text_blocks = [
                {
                    "text": " ".join(
                        span["text"]
                        for line in block["lines"]
                        for span in line["spans"]
                    )
                }
                for block in blocks
                if "lines" in block
            ]
            digital.append({
                "page": page.number + 1,  # page.number is 0-based
                "digital_text": text_blocks,
            })
    finally:
        # Release the document even if text extraction fails part-way.
        doc.close()

    # Second pass: rasterise the same bytes and OCR each resulting image.
    ocr = [
        {"ocr_text": pytesseract.image_to_string(image)}
        for image in convert_from_bytes(pdf_bytes)
    ]

    return jsonify({
        "digital": digital,
        "ocr": ocr,
    })
def _parse_edits(raw):
    """Decode the ``edits`` form field into a sequence of per-page dicts.

    SECURITY NOTE(review): this field used to be passed to ``eval()``, which
    let any client execute arbitrary Python on the server. It is now parsed
    as data only: JSON first, then ``ast.literal_eval`` so that clients that
    were sending Python-literal syntax (single quotes, ``True``/``None``)
    keep working.

    Args:
        raw: The raw string taken from the request form.

    Returns:
        The decoded list (or tuple) of dicts.

    Raises:
        ValueError: If ``raw`` is neither valid JSON nor a Python literal, or
            if the decoded value is not a list/tuple of dicts.
    """
    try:
        edits = json.loads(raw)
    except (ValueError, RecursionError):
        try:
            edits = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError,
                MemoryError, RecursionError) as exc:
            raise ValueError(
                'Edits must be valid JSON or a Python literal'
            ) from exc

    if not isinstance(edits, (list, tuple)) or not all(
        isinstance(entry, dict) for entry in edits
    ):
        raise ValueError('Edits must be a list of objects')
    return edits


@app.route('/rebuild', methods=['POST'])
def rebuild_pdf():
    """Write edited text onto the pages of an uploaded PDF and return it.

    Expects a POST request with the PDF in the ``pdf`` file field and an
    ``edits`` form field holding a list of objects, each with:

    * ``pageNumber`` -- 1-based page to write on (required);
    * ``digitalText`` -- optional list of ``{"text": ...}`` blocks, written
      in black at x=50 starting at y=100 and advancing 20 units per block;
    * ``ocrText`` -- optional string, written in red 20 units below the
      position reached after the digital blocks.

    Returns:
        The modified PDF as an attachment named ``edited_output.pdf``, or a
        JSON ``error`` with HTTP 400 when a field is missing, ``edits`` cannot
        be parsed, the upload is not a readable PDF, or a ``pageNumber`` is
        not an integer within the document's page range.
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'PDF is required'}), 400
    if 'edits' not in request.form:
        return jsonify({'error': 'Edits are required'}), 400

    try:
        edits = _parse_edits(request.form['edits'])
    except ValueError as exc:
        return jsonify({'error': str(exc)}), 400

    try:
        doc = fitz.open(stream=request.files['pdf'].read(), filetype="pdf")
    except (RuntimeError, ValueError):
        return jsonify({'error': 'Uploaded file is not a readable PDF'}), 400

    try:
        page_count = len(doc)

        for page_data in edits:
            page_num = page_data.get('pageNumber')
            # Reject missing, non-integer and out-of-range values up front:
            # 0 or a negative number would otherwise index from the end of
            # the document, and anything else would raise inside PyMuPDF.
            if (
                isinstance(page_num, bool)
                or not isinstance(page_num, int)
                or not 1 <= page_num <= page_count
            ):
                return jsonify(
                    {'error': f'Invalid pageNumber: {page_num!r}'}
                ), 400

            digital_blocks = page_data.get('digitalText', [])
            ocr_text = page_data.get('ocrText', '')

            page = doc[page_num - 1]
            y = 100

            for block in digital_blocks:
                page.insert_text((50, y), block.get('text', ''), fontsize=12)
                y += 20

            if ocr_text:
                # OCR text goes below the digital blocks, in red.
                page.insert_text(
                    (50, y + 20), ocr_text, fontsize=12, color=(1, 0, 0)
                )

        output_pdf = io.BytesIO()
        doc.save(output_pdf)
    finally:
        # Release the document on every exit path, including early 400s.
        doc.close()

    # Rewind so send_file streams the buffer from the start.
    output_pdf.seek(0)

    return send_file(
        output_pdf,
        as_attachment=True,
        download_name='edited_output.pdf',
        mimetype='application/pdf',
    )
if __name__ == '__main__':
    # Start Flask's built-in server when the module is executed as a script,
    # listening on every interface (0.0.0.0) at port 7860.
    app.run(host="0.0.0.0", port=7860)