Spaces:

Gagandeep12
/

extopen-src

Sleeping

App Files Files Community

Gagandeep12 committed on Sep 27, 2025

Commit

05a982f

verified ·

1 Parent(s): 4f9dba0

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -13

app.py CHANGED Viewed

@@ -1,13 +1,21 @@
 import os
 import time
 from flask import Flask, request, jsonify, send_from_directory
 from werkzeug.utils import secure_filename
 from dotenv import load_dotenv
-from azure_ocr import submit_read_api, poll_read_result, split_pdf_into_chunks, clean_extracted_text
 # Load env
 load_dotenv()
 UPLOAD_DIR = "/tmp/uploads"
 os.makedirs(UPLOAD_DIR, exist_ok=True)
@@ -20,13 +28,72 @@ app = Flask(__name__, static_folder="static", static_url_path="/static")
 def allowed_file(filename):
     """Return True when *filename* has an extension found in ALLOWED_EXTENSIONS.

     The extension is the text after the last dot, compared in lower case.
     """
     # A name without any dot has no extension to check.
     if "." not in filename:
         return False
     extension = filename.rsplit(".", 1)[1].lower()
     return extension in ALLOWED_EXTENSIONS
 # --- Routes ---
 @app.route("/")
 def index():
     """Serve ``index.html`` from the ``static`` directory for the root URL."""
     landing_page = "index.html"
     return send_from_directory("static", landing_page)
 @app.route("/upload", methods=["POST"])
 def upload():
     if "file" not in request.files:
@@ -45,23 +112,14 @@ def upload():
         if filename.lower().endswith(".pdf"):
             chunks = split_pdf_into_chunks(path, chunk_size=2)
             merged_results = []
-            for idx, chunk_file in enumerate(chunks, 1):
-                print(f"📄 Processing chunk {idx}/{len(chunks)}")
                 op_location = submit_read_api(chunk_file)
                 chunk_text = poll_read_result(op_location)
                 merged_results.append(chunk_text)
-                # throttle between requests
-                time.sleep(2)
             extracted_text = "\n\n".join(merged_results)
-        else:  # images
             op_location = submit_read_api(path)
             extracted_text = poll_read_result(op_location)
-        # cleanup text
-        extracted_text = clean_extracted_text(extracted_text)
     except Exception as e:
         return jsonify({"error": "OCR failed", "details": str(e)}), 500

 import os
+import tempfile
 import time
+import requests
 from flask import Flask, request, jsonify, send_from_directory
 from werkzeug.utils import secure_filename
+from PyPDF2 import PdfReader, PdfWriter
 from dotenv import load_dotenv
 # Load env
 load_dotenv()
+AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
+AZURE_KEY = os.environ.get("AZURE_KEY")
+if not AZURE_ENDPOINT or not AZURE_KEY:
+    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")
+AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
 UPLOAD_DIR = "/tmp/uploads"
 os.makedirs(UPLOAD_DIR, exist_ok=True)
 def allowed_file(filename):
     """Return True when *filename* has an extension found in ALLOWED_EXTENSIONS.

     The extension is the text after the last dot, compared in lower case.
     """
     # A name without any dot has no extension to check.
     if "." not in filename:
         return False
     extension = filename.rsplit(".", 1)[1].lower()
     return extension in ALLOWED_EXTENSIONS
def read_file_bytes(path):
    """Return the entire contents of the file at *path* as ``bytes``."""
    with open(path, "rb") as source:
        payload = source.read()
    return payload
def submit_read_api(file_path, timeout=60):
    """Submit a file to the Azure Computer Vision Read API (v3.2).

    The file is uploaded as a raw ``application/octet-stream`` body. The
    service processes it asynchronously and answers with the URL of the
    operation to poll.

    Args:
        file_path: Path of the file to upload.
        timeout: Seconds ``requests`` may wait on the connection/response
            before raising, so a stalled connection cannot block the
            calling request handler indefinitely.

    Returns:
        The value of the ``Operation-Location`` response header.

    Raises:
        requests.HTTPError: If the service responds with an error status.
        requests.Timeout: If the service does not respond within *timeout*.
        RuntimeError: If the response carries no ``Operation-Location`` header.
    """
    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_KEY,
        "Content-Type": "application/octet-stream",
    }
    data = read_file_bytes(file_path)
    # Without an explicit timeout, requests waits forever on a dead socket.
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    resp.raise_for_status()
    op_location = resp.headers.get("Operation-Location")
    if not op_location:
        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
    return op_location
def poll_read_result(operation_location, timeout=180, interval=2.0):
    """Poll an Azure Read operation until it finishes and return its text.

    Args:
        operation_location: Operation URL to poll.
        timeout: Overall number of seconds to keep polling. The operation is
            always polled at least once, even when this is zero or negative.
        interval: Seconds to sleep between two polls.

    Returns:
        The text of every recognised line, joined with newlines. An empty
        string when the result contains no lines.

    Raises:
        requests.HTTPError: If a poll responds with an error status.
        RuntimeError: If the operation reports ``failed`` or does not reach
            a final state before *timeout* elapses.
    """
    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
    # Bound each individual HTTP call so one hung poll cannot outlive the
    # overall deadline by an unbounded amount.
    request_timeout = 30
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        r = requests.get(operation_location, headers=headers, timeout=request_timeout)
        r.raise_for_status()
        j = r.json()
        status = j.get("status", "").lower()
        if status in ("succeeded", "failed"):
            break
        # Checking the deadline only after a poll guarantees `status` and `j`
        # are always bound and lets us report a timeout as a timeout.
        if time.monotonic() >= deadline:
            raise RuntimeError(
                f"OCR timed out after {timeout}s. Status={status}, Response={j}"
            )
        time.sleep(interval)

    if status != "succeeded":
        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")

    results = j.get("analyzeResult", {})
    lines = [
        line["text"]
        for read_result in results.get("readResults", [])
        for line in read_result.get("lines", [])
    ]
    return "\n".join(lines)
def split_pdf_into_chunks(pdf_path, chunk_size=2):
    """Split a PDF into consecutive temporary PDFs of *chunk_size* pages.

    Args:
        pdf_path: Path of the PDF to split.
        chunk_size: Maximum number of pages per chunk; must be at least 1.
            The final chunk may hold fewer pages.

    Returns:
        A list of paths to the chunk files, in page order. The files are
        created with ``delete=False`` and are not removed here, so the
        caller has to delete them when done.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    # Reject bad sizes up front: 0 would crash inside range() and a negative
    # value would silently produce no chunks at all.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunk_files = []
    for start in range(0, total_pages, chunk_size):
        writer = PdfWriter()
        for p in range(start, min(start + chunk_size, total_pages)):
            writer.add_page(reader.pages[p])
        # Write through the temp file's own handle and let the context
        # manager close it; reopening by name leaked one descriptor per chunk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            writer.write(tmp)
        chunk_files.append(tmp.name)
    return chunk_files
 # --- Routes ---
 @app.route("/")
 def index():
     """Serve ``index.html`` from the ``static`` directory for the root URL."""
     landing_page = "index.html"
     return send_from_directory("static", landing_page)
 @app.route("/upload", methods=["POST"])
 def upload():
     if "file" not in request.files:
         if filename.lower().endswith(".pdf"):
             chunks = split_pdf_into_chunks(path, chunk_size=2)
             merged_results = []
+            for chunk_file in chunks:
                 op_location = submit_read_api(chunk_file)
                 chunk_text = poll_read_result(op_location)
                 merged_results.append(chunk_text)
             extracted_text = "\n\n".join(merged_results)
+        else:
             op_location = submit_read_api(path)
             extracted_text = poll_read_result(op_location)
     except Exception as e:
         return jsonify({"error": "OCR failed", "details": str(e)}), 500