File size: 2,406 Bytes
c770f0d
7ef5638
 
 
c770f0d
7ef5638
 
 
 
 
 
c770f0d
7ef5638
 
 
c770f0d
 
7ef5638
c770f0d
7ef5638
c770f0d
 
7ef5638
c770f0d
7ef5638
c770f0d
 
 
 
 
 
7ef5638
 
c770f0d
 
 
 
 
 
 
7ef5638
 
 
c770f0d
 
7ef5638
 
c770f0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ef5638
c770f0d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import ast
import io
import json

import fitz  # PyMuPDF
import pytesseract
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from pdf2image import convert_from_bytes

# Application object; the route decorators in this module register against it.
app = Flask(__name__)
# Wrap the app with flask-cors without passing any options, so the library's
# defaults decide which origins and routes are allowed.
# NOTE(review): presumably the defaults permit every origin -- confirm that is
# intended for this service.
CORS(app)

@app.route('/upload', methods=['POST'])
def upload_pdf():
    """Extract embedded ("digital") text and OCR text from an uploaded PDF.

    Expects a multipart POST whose ``pdf`` file field holds the document.

    Returns:
        A JSON object with two lists:

        * ``digital`` - one entry per page, ``{"page": <1-based number>,
          "digital_text": [{"text": ...}, ...]}``, one item per text block
          reported by PyMuPDF.
        * ``ocr`` - one entry per rendered page image, ``{"ocr_text": ...}``.

        A JSON ``{"error": ...}`` body with HTTP 400 is returned when the
        file field is missing, the upload is empty, or PyMuPDF cannot open it.
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'No PDF uploaded'}), 400

    pdf_bytes = request.files['pdf'].read()
    if not pdf_bytes:
        return jsonify({'error': 'Uploaded PDF is empty'}), 400

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except (RuntimeError, ValueError):
        # NOTE(review): PyMuPDF's file-data errors are expected to derive from
        # RuntimeError (ValueError in some releases) -- confirm against the
        # pinned version.
        return jsonify({'error': 'Could not read PDF'}), 400

    # Digital text extraction. The ``with`` block closes the document once
    # the pages have been read, so the handle is not leaked per request.
    digital = []
    with doc:
        for page in doc:
            text_blocks = []
            for block in page.get_text("dict")["blocks"]:
                # Only blocks that carry a "lines" key are turned into text.
                if "lines" not in block:
                    continue
                spans = (span["text"] for line in block["lines"] for span in line["spans"])
                text_blocks.append({"text": " ".join(spans)})
            digital.append({
                "page": page.number + 1,
                "digital_text": text_blocks,
            })

    # OCR extraction: render the PDF to images and run Tesseract on each.
    ocr = [
        {"ocr_text": pytesseract.image_to_string(img)}
        for img in convert_from_bytes(pdf_bytes)
    ]

    return jsonify({
        "digital": digital,
        "ocr": ocr
    })


@app.route('/rebuild', methods=['POST'])
def rebuild_pdf():
    """Write edited text onto the pages of an uploaded PDF and return it.

    Expects a multipart POST with:

    * ``pdf`` - the PDF file to modify.
    * ``edits`` - a form field holding a list of per-page objects. Each object
      has ``pageNumber`` (1-based), an optional ``digitalText`` list of
      ``{"text": ...}`` blocks, and an optional ``ocrText`` string.

    For every listed page the digital blocks are written at x=50 starting at
    y=100 in 20-point steps; the OCR text, if any, follows in red.

    Returns:
        The modified document as an ``edited_output.pdf`` attachment, or a
        JSON ``{"error": ...}`` body with HTTP 400 when the request is
        missing a field or carries malformed edits / an unreadable PDF.
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'PDF is required'}), 400
    if 'edits' not in request.form:
        return jsonify({'error': 'Edits are required'}), 400

    # SECURITY: this field is attacker-controlled. It used to be passed to
    # eval(), which allowed arbitrary code execution. Parse it as data only:
    # JSON first, then a Python *literal* as a fallback so clients that sent
    # Python-style syntax (single quotes, True/None) keep working.
    raw_edits = request.form['edits']
    try:
        edits = json.loads(raw_edits)
    except (ValueError, RecursionError):
        try:
            edits = ast.literal_eval(raw_edits)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return jsonify({'error': 'Edits must be a valid JSON list'}), 400
    if not isinstance(edits, (list, tuple)):
        return jsonify({'error': 'Edits must be a valid JSON list'}), 400

    pdf_bytes = request.files['pdf'].read()
    if not pdf_bytes:
        return jsonify({'error': 'Uploaded PDF is empty'}), 400
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except (RuntimeError, ValueError):
        # NOTE(review): PyMuPDF's file-data errors are expected to derive from
        # RuntimeError (ValueError in some releases) -- confirm against the
        # pinned version.
        return jsonify({'error': 'Could not read PDF'}), 400

    # ``with`` guarantees the document is closed on every exit path.
    with doc:
        page_count = len(doc)
        for page_data in edits:
            if not isinstance(page_data, dict):
                return jsonify({'error': 'Each edit must be an object'}), 400

            # Reject missing, non-integer and out-of-range page numbers.
            # Without this check 0 would wrap around to the last page through
            # negative indexing, and larger values would raise IndexError.
            page_num = page_data.get('pageNumber')
            if (isinstance(page_num, bool) or not isinstance(page_num, int)
                    or not 1 <= page_num <= page_count):
                return jsonify({'error': f'Invalid pageNumber: {page_num!r}'}), 400

            page = doc[page_num - 1]
            y = 100

            for block in page_data.get('digitalText', []):
                page.insert_text((50, y), block.get('text', ''), fontsize=12)
                y += 20

            ocr_text = page_data.get('ocrText', '')
            if ocr_text:
                # OCR text goes one extra line below the digital blocks, in red.
                page.insert_text((50, y + 20), ocr_text, fontsize=12, color=(1, 0, 0))

        # Serialise while the document is still open.
        output_pdf = io.BytesIO()
        doc.save(output_pdf)

    output_pdf.seek(0)
    return send_file(output_pdf, as_attachment=True, download_name='edited_output.pdf', mimetype='application/pdf')


# Start the server only when this file is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    # Listen on every network interface ("0.0.0.0") at port 7860.
    # NOTE(review): app.run() is presumably Flask's development server --
    # confirm a production WSGI server is used for real deployments.
    app.run(host="0.0.0.0", port=7860)