mike23415 committed on
Commit
c770f0d
·
verified ·
1 Parent(s): 8bd7c7f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -44
app.py CHANGED
@@ -1,68 +1,85 @@
1
- from flask import Flask, request, jsonify
2
  from flask_cors import CORS
3
  import fitz # PyMuPDF
4
- from pdf2image import convert_from_bytes
5
  import pytesseract
6
- from PIL import Image
7
  import io
8
 
# Create the Flask application object for this module.
app = Flask(__name__)
# Wrap the app with flask_cors so cross-origin requests are accepted.
CORS(app)
11
 
@app.route('/')
def home():
    """Return a fixed plain-text message confirming the backend is up."""
    status_message = '✅ PDF Editor Backend is running.'
    return status_message
15
-
@app.route('/upload', methods=['POST'])
def upload():
    """Extract per-span digital text and per-page OCR text from an uploaded PDF.

    Reads the file sent under the ``pdf`` form field.  Responds with HTTP 400
    and an ``error`` message when that field is missing; otherwise responds
    with JSON containing ``ocr`` (page number + OCR text per page) and
    ``digital`` (page number + a list of spans with text, bbox, font, size).
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'No PDF uploaded'}), 400

    pdf_bytes = request.files['pdf'].read()

    # Rasterise the pages first; the images are indexed by page below.
    page_images = convert_from_bytes(pdf_bytes)
    ocr_pages = []
    digital_pages = []

    document = fitz.open(stream=pdf_bytes, filetype="pdf")

    for index, page in enumerate(document):
        page_label = index + 1

        # ----- Digital text: one entry per span of every text block -----
        spans = []
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type 0 (text) blocks are collected
                continue
            for line in block['lines']:
                spans.extend(
                    {
                        "text": span['text'],
                        "bbox": span['bbox'],
                        "font": span.get('font', ''),
                        "size": span.get('size', 0),
                    }
                    for span in line['spans']
                )
        digital_pages.append({
            "page": page_label,
            "digital_text": spans,
        })

        # ----- OCR: serialise the page image to PNG, reopen it, run OCR -----
        png_buffer = io.BytesIO()
        page_images[index].save(png_buffer, format='PNG')
        png_bytes = png_buffer.getvalue()
        reopened = Image.open(io.BytesIO(png_bytes))
        ocr_pages.append({
            "page": page_label,
            "ocr_text": pytesseract.image_to_string(reopened),
        })

    return jsonify({
        "ocr": ocr_pages,
        "digital": digital_pages,
    })
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
if __name__ == '__main__':
    # Only start the server when this file is executed as a script.
    app.run(port=7860, host='0.0.0.0')
 
1
+ from flask import Flask, request, jsonify, send_file
2
  from flask_cors import CORS
3
  import fitz # PyMuPDF
 
4
  import pytesseract
5
+ from pdf2image import convert_from_bytes
6
  import io
7
 
# Create the Flask application object for this module.
app = Flask(__name__)
# Wrap the app with flask_cors so cross-origin requests are accepted.
CORS(app)
10
 
 
 
 
 
@app.route('/upload', methods=['POST'])
def upload_pdf():
    """Extract embedded (digital) text and OCR text from an uploaded PDF.

    Reads the file sent under the ``pdf`` form field.

    Returns:
        HTTP 400 with an ``error`` message when no ``pdf`` file part is
        present.  Otherwise a JSON object with:

        * ``digital`` - one entry per document page:
          ``{"page": <1-based number>, "digital_text": [{"text": ...}, ...]}``
          where each inner entry is the space-joined span text of one block
          that has ``lines``.
        * ``ocr`` - one entry per image returned by ``convert_from_bytes``:
          ``{"page": <1-based position>, "ocr_text": ...}``.
    """
    if 'pdf' not in request.files:
        return jsonify({'error': 'No PDF uploaded'}), 400

    pdf_bytes = request.files['pdf'].read()

    # Digital text extraction
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        digital = []
        for page in doc:
            text_blocks = []
            for block in page.get_text("dict")["blocks"]:
                # Blocks without "lines" carry no text spans; skip them.
                if "lines" not in block:
                    continue
                text = " ".join(
                    span["text"]
                    for line in block["lines"]
                    for span in line["spans"]
                )
                text_blocks.append({"text": text})
            digital.append({
                "page": page.number + 1,
                "digital_text": text_blocks,
            })
    finally:
        # Release the document handle even if extraction fails part-way.
        doc.close()

    # OCR extraction.  Each entry is tagged with its 1-based position so the
    # client can pair it with the matching "digital" entry.
    ocr = [
        {"page": position, "ocr_text": pytesseract.image_to_string(image)}
        for position, image in enumerate(convert_from_bytes(pdf_bytes), start=1)
    ]

    return jsonify({
        "digital": digital,
        "ocr": ocr,
    })
47
 
48
+
@app.route('/rebuild', methods=['POST'])
def rebuild_pdf():
    """Write edited text onto the pages of an uploaded PDF and return the result.

    Expects a multipart form with:

    * ``pdf``   - the PDF file to modify.
    * ``edits`` - a JSON array of objects, each with ``pageNumber`` (1-based),
      optional ``digitalText`` (list of objects with a ``text`` key) and
      optional ``ocrText`` (string).

    For each edit, every digital block's text is inserted at x=50 starting at
    y=100 and stepping 20 units per block; the OCR text, when non-empty, is
    inserted 20 units below the last position in red.

    Returns:
        The modified PDF as an attachment named ``edited_output.pdf``, or
        HTTP 400 with an ``error`` message when the inputs are missing,
        ``edits`` is not a valid JSON array, or a page number is invalid.
    """
    import json  # local import: keeps this block self-contained

    if 'pdf' not in request.files:
        return jsonify({'error': 'PDF is required'}), 400
    if 'edits' not in request.form:
        return jsonify({'error': 'Edits are required'}), 400

    # SECURITY: this value comes straight from the client.  It used to be
    # passed to eval(), which lets any caller execute arbitrary Python on the
    # server.  It is now parsed strictly as JSON.
    try:
        edits = json.loads(request.form['edits'])
    except ValueError:
        return jsonify({'error': 'Edits must be valid JSON'}), 400
    if not isinstance(edits, list):
        return jsonify({'error': 'Edits must be a JSON array'}), 400

    doc = fitz.open(stream=request.files['pdf'].read(), filetype="pdf")
    try:
        page_count = len(doc)
        for page_data in edits:
            page_num = page_data.get('pageNumber') if isinstance(page_data, dict) else None
            # Reject 0 / negative numbers explicitly: doc[-1] would otherwise
            # silently edit the last page instead of failing.
            if not isinstance(page_num, int) or not 1 <= page_num <= page_count:
                return jsonify({'error': 'Invalid pageNumber in edits'}), 400

            digital_blocks = page_data.get('digitalText', [])
            ocr_text = page_data.get('ocrText', '')

            page = doc[page_num - 1]
            y = 100

            for block in digital_blocks:
                page.insert_text((50, y), block.get('text', ''), fontsize=12)
                y += 20

            if ocr_text:
                # OCR text goes below the digital blocks, in red.
                page.insert_text((50, y + 20), ocr_text, fontsize=12, color=(1, 0, 0))

        output_pdf = io.BytesIO()
        doc.save(output_pdf)
    finally:
        # Release the document handle on every path, including early returns.
        doc.close()

    output_pdf.seek(0)
    return send_file(
        output_pdf,
        as_attachment=True,
        download_name='edited_output.pdf',
        mimetype='application/pdf',
    )
82
+
83
+
if __name__ == '__main__':
    # Only start the server when this file is executed as a script.
    app.run(port=7860, host="0.0.0.0")