Spaces:

Gagandeep12
/

extopen-src

Sleeping

App Files Files Community

Gagandeep12 committed on Sep 24, 2025

Commit

1e39f6a

verified ·

1 Parent(s): 2234729

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -106

app.py CHANGED Viewed

@@ -1,112 +1,130 @@
-from flask import Flask, request, jsonify, render_template
-from flask_cors import CORS
-import pytesseract
-import numpy as np
-from PIL import Image
-import fitz
-import io
-import easyocr
 import os
-app = Flask(__name__)
-CORS(app)
-# ==============================
-# EasyOCR Setup (use /tmp to avoid permission errors)
-# ==============================
-EASY_OCR_DIR = os.path.join("/tmp", ".EasyOCR")
-os.makedirs(EASY_OCR_DIR, exist_ok=True)
-# Initialize EasyOCR reader once (English + Hindi)
-reader = easyocr.Reader(
-    ['en', 'hi'],
-    gpu=False,
-    model_storage_directory=EASY_OCR_DIR,
-    user_network_directory=os.path.join(EASY_OCR_DIR, "user_network")
-)
-@app.route('/')
-def home():
-    return render_template('index.html')
-@app.route('/extract', methods=['POST'])
-def extract_text():
-    file = request.files.get('file')
-    method = request.form.get('method', 'tesseract')  # default: tesseract
-    if not file:
-        return jsonify({'error': 'No file uploaded'}), 400
     try:
-        filename = file.filename.lower()
-        print(f"Received file: {filename} with method: {method}")  # Debug
-        if filename.endswith('.pdf'):
-            return jsonify({'text': extract_text_from_pdf(file)})
-        elif filename.endswith(('.png', '.jpg', '.jpeg')):
-            if method == 'easyocr':
-                return jsonify({'text': extract_text_with_easyocr(file)})
-            else:
-                return jsonify({'text': extract_text_from_image(file)})
         else:
-            return jsonify({'error': 'Unsupported file format'}), 400
     except Exception as e:
-        print(f"Error processing file: {str(e)}")  # Debug
-        return jsonify({'error': str(e)}), 500
-def extract_text_from_pdf(file):
-    text = ""
-    pdf_bytes = file.read()
-    pdf = fitz.open(stream=pdf_bytes, filetype="pdf")
-    for page_number, page in enumerate(pdf):
-        try:
-            page_text = page.get_text()
-            if page_text.strip():
-                text += f"\n--- Page {page_number + 1} ---\n"
-                text += page_text + "\n"
-            else:
-                pix = page.get_pixmap(dpi=300)
-                image_bytes = pix.tobytes("png")
-                image = Image.open(io.BytesIO(image_bytes))
-                ocr_text = pytesseract.image_to_string(
-                    image, lang='hin+eng', config='--oem 3 --psm 6'
-                )
-                text += f"\n--- Page {page_number + 1} (OCR) ---\n"
-                text += ocr_text + "\n"
-        except Exception as e:
-            text += f"\n--- Page {page_number + 1} (Error) ---\nError extracting text: {str(e)}\n"
-            continue
-    return text.strip()
-def extract_text_from_image(file, method='tesseract'):
-    image = Image.open(io.BytesIO(file.read())).convert("RGB")
-    if method == 'easyocr':
-        image_np = np.array(image)
-        result = reader.readtext(image_np, detail=0)
-        return '\n'.join(result)
-    else:
-        custom_config = r'--oem 3 --psm 6'
-        return pytesseract.image_to_string(
-            image, lang='hin+eng', config=custom_config
-        )
-def extract_text_with_easyocr(file):
-    image = Image.open(io.BytesIO(file.read())).convert("RGB")
-    image_np = np.array(image)
-    result = reader.readtext(image_np)
-    sorted_result = sorted(result, key=lambda x: x[0][0][1])  # sort by top y
-    extracted_text = "\n".join([text[1] for text in sorted_result])
-    return extracted_text
-if __name__ == '__main__':
     app.run(host="0.0.0.0", port=7860, debug=True)

 import os
+import tempfile
+import time
+import requests
+from flask import Flask, request, jsonify, send_from_directory
+from werkzeug.utils import secure_filename
+from PyPDF2 import PdfReader, PdfWriter
+from dotenv import load_dotenv
+# Load env: pull variables from a .env file (if one exists) before reading them below.
+load_dotenv()
+# Azure credentials are mandatory; fail at import time rather than on the first request.
+AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
+AZURE_KEY = os.environ.get("AZURE_KEY")
+if not AZURE_ENDPOINT or not AZURE_KEY:
+    raise RuntimeError("Set AZURE_ENDPOINT and AZURE_KEY in .env")
+# Drop any trailing slash so URLs built as f"{AZURE_ENDPOINT}/vision/..." never contain "//".
+AZURE_ENDPOINT = AZURE_ENDPOINT.rstrip("/")
+# Directory where uploaded files are stored; created on import if missing.
+UPLOAD_DIR = "/tmp/uploads"
+os.makedirs(UPLOAD_DIR, exist_ok=True)
+# Lower-case file extensions (without the dot) accepted by allowed_file().
+ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "pdf"}
+# Flask application; files under ./static are served at the /static URL prefix.
+app = Flask(__name__, static_folder="static", static_url_path="/static")
+# --- Helpers ---
+def allowed_file(filename):
+    """Return True when `filename` has an extension listed in ALLOWED_EXTENSIONS.
+
+    The extension is the text after the last dot, compared case-insensitively.
+    A name without any dot is rejected.
+    """
+    _, dot, extension = filename.rpartition(".")
+    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
+def read_file_bytes(path):
+    """Return the full contents of the file at `path` as bytes."""
+    with open(path, mode="rb") as handle:
+        contents = handle.read()
+    return contents
+def submit_read_api(file_path, timeout=60):
+    """Submit a file to the Computer Vision Read API and return its operation URL.
+
+    Args:
+        file_path: Path of the image or PDF to analyze; sent as raw bytes.
+        timeout: Seconds to wait for the service before giving up. Without a
+            timeout a stalled connection would block the request forever.
+
+    Returns:
+        The value of the ``Operation-Location`` response header, which is the
+        URL to poll for the result.
+
+    Raises:
+        requests.HTTPError: if the service answers with an error status code.
+        requests.Timeout: if the service does not answer within `timeout`.
+        RuntimeError: if the response carries no ``Operation-Location`` header.
+    """
+    url = f"{AZURE_ENDPOINT}/vision/v3.2/read/analyze"
+    headers = {
+        "Ocp-Apim-Subscription-Key": AZURE_KEY,
+        "Content-Type": "application/octet-stream"
+    }
+    data = read_file_bytes(file_path)
+    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
+    resp.raise_for_status()
+    op_location = resp.headers.get("Operation-Location")
+    if not op_location:
+        raise RuntimeError(f"No Operation-Location header. Response: {resp.text}")
+    return op_location
+def poll_read_result(operation_location, timeout=180, interval=2.0, request_timeout=30):
+    """Poll a Read API operation until it finishes and return the recognized text.
+
+    Args:
+        operation_location: URL to poll, as returned by submit_read_api().
+        timeout: Overall budget in seconds for the operation to finish.
+        interval: Seconds to sleep between two polls.
+        request_timeout: Seconds to wait for each individual HTTP response.
+
+    Returns:
+        Every recognized line joined with newlines ("" when nothing was read).
+
+    Raises:
+        RuntimeError: if the service reports ``failed`` or the operation is
+            still unfinished once `timeout` seconds have elapsed.
+        requests.HTTPError: if a poll answers with an error status code.
+    """
+    headers = {"Ocp-Apim-Subscription-Key": AZURE_KEY}
+    # Monotonic clock: the deadline must not shift when the wall clock is adjusted.
+    deadline = time.monotonic() + timeout
+    # Always poll at least once so `status` and `j` are bound even for timeout <= 0.
+    while True:
+        r = requests.get(operation_location, headers=headers, timeout=request_timeout)
+        r.raise_for_status()
+        j = r.json()
+        status = j.get("status", "").lower()
+        if status in ("succeeded", "failed"):
+            break
+        if time.monotonic() >= deadline:
+            # Report an expired deadline as such instead of as a service failure.
+            raise RuntimeError(f"OCR timed out after {timeout}s. Status={status}, Response={j}")
+        time.sleep(interval)
+    if status != "succeeded":
+        raise RuntimeError(f"OCR failed. Status={status}, Response={j}")
+    results = j.get("analyzeResult", {})
+    return "\n".join(
+        line["text"]
+        for read_result in results.get("readResults", [])
+        for line in read_result.get("lines", [])
+    )
+def split_pdf_into_chunks(pdf_path, chunk_size=2):
+    """Split a PDF into temporary PDF files of at most `chunk_size` pages each.
+
+    Args:
+        pdf_path: Path of the source PDF.
+        chunk_size: Maximum number of pages per chunk; must be at least 1.
+
+    Returns:
+        Paths of the chunk files in page order. The files are created with
+        ``delete=False``, so the caller is responsible for removing them.
+
+    Raises:
+        ValueError: if `chunk_size` is smaller than 1.
+    """
+    if chunk_size < 1:
+        raise ValueError("chunk_size must be at least 1")
+    reader = PdfReader(pdf_path)
+    total_pages = len(reader.pages)
+    chunk_files = []
+    try:
+        for start in range(0, total_pages, chunk_size):
+            writer = PdfWriter()
+            for p in range(start, min(start + chunk_size, total_pages)):
+                writer.add_page(reader.pages[p])
+            # Write through the temp file's own handle and close it right away;
+            # reopening by name leaked the descriptor and fails on Windows.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                chunk_files.append(tmp.name)
+                writer.write(tmp)
+    except Exception:
+        # Do not leave partial chunks behind when splitting fails midway.
+        for chunk_path in chunk_files:
+            try:
+                os.remove(chunk_path)
+            except OSError:
+                pass
+        raise
+    return chunk_files
+# --- Routes ---
+@app.route("/")
+def index():
+    """Serve ``index.html`` from the ``static`` directory as the landing page."""
+    static_dir, page = "static", "index.html"
+    return send_from_directory(static_dir, page)
+@app.route("/upload", methods=["POST"])
+def upload():
+    """Run OCR on an uploaded image or PDF and return the text as JSON.
+
+    Expects a multipart form field named ``file``. PDFs are split into
+    2-page chunks that are submitted one after another; images are
+    submitted as a single request.
+
+    Returns:
+        ``{"text": ...}`` on success, ``{"error": ...}`` with status 400 for
+        a missing, unnamed or disallowed file, and ``{"error": "OCR failed",
+        "details": ...}`` with status 500 when saving or OCR raises.
+    """
+    if "file" not in request.files:
+        return jsonify({"error": "No file part"}), 400
+    file = request.files["file"]
+    if file.filename == "":
+        return jsonify({"error": "Empty filename"}), 400
+    if not allowed_file(file.filename):
+        return jsonify({"error": "File type not allowed"}), 400
+    # Take the extension from the validated name: secure_filename() can drop it
+    # (e.g. a non-ASCII "name.pdf" becomes "pdf"), which misrouted such PDFs.
+    extension = file.filename.rsplit(".", 1)[1].lower()
+    # A unique server-chosen name keeps concurrent uploads from overwriting
+    # each other and keeps user input out of the stored path.
+    fd, path = tempfile.mkstemp(suffix=f".{extension}", dir=UPLOAD_DIR)
+    os.close(fd)
+    chunks = []
     try:
+        file.save(path)
+        if extension == "pdf":
+            chunks = split_pdf_into_chunks(path, chunk_size=2)
+            merged_results = []
+            for chunk_file in chunks:
+                op_location = submit_read_api(chunk_file)
+                chunk_text = poll_read_result(op_location)
+                merged_results.append(chunk_text)
+            extracted_text = "\n\n".join(merged_results)
         else:
+            op_location = submit_read_api(path)
+            extracted_text = poll_read_result(op_location)
     except Exception as e:
+        return jsonify({"error": "OCR failed", "details": str(e)}), 500
+    finally:
+        # Best-effort cleanup so uploads and chunk files do not pile up on disk.
+        for temp_path in [path, *chunks]:
+            try:
+                os.remove(temp_path)
+            except OSError:
+                pass
+    return jsonify({"text": extracted_text})
+# Script entry point: start Flask's built-in server on all interfaces, port 7860.
+# NOTE(review): debug=True together with host="0.0.0.0" appears to expose the
+# interactive debugger to any client that can reach this port - confirm this
+# is intended, or disable debug for deployed instances.
+if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=True)