iammraat committed on
Commit
00ef556
·
verified ·
1 Parent(s): 2a2ba6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +235 -63
app.py CHANGED
@@ -94,81 +94,258 @@
94
 
95
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  import gradio as gr
98
  import numpy as np
99
  import cv2
100
  import traceback
101
  import tempfile
102
  import os
103
- from PIL import Image
104
  from doctr.io import DocumentFile
105
  from doctr.models import ocr_predictor
106
- from transformers import pipeline
107
 
108
  # ------------------------------------------------------
109
- # 1. Load Models Globally
110
  # ------------------------------------------------------
111
  print("⏳ Loading models...")
112
 
113
  # A. Load DocTR (OCR)
114
  try:
115
- # 'fast_base' is lightweight for CPU
116
  ocr_model = ocr_predictor(det_arch='fast_base', reco_arch='crnn_vgg16_bn', pretrained=True)
117
  print("✅ DocTR loaded.")
118
  except Exception as e:
119
  print(f"❌ DocTR Load Error: {e}")
120
  raise e
121
 
122
- # B. Load Corrector (Small Language Model)
 
 
 
 
123
  try:
124
- # 'google/flan-t5-small' is ~250MB, well under the 1GB limit.
125
- # We use a text2text-generation pipeline.
126
- corrector = pipeline(
127
- "text2text-generation",
128
- model="google/flan-t5-small",
129
- device=-1 # -1 forces CPU
130
  )
131
- print("✅ Correction model (Flan-T5-Small) loaded.")
132
  except Exception as e:
133
- print(f"❌ Corrector Load Error: {e}")
134
- corrector = None
 
135
 
136
  # ------------------------------------------------------
137
- # 2. Correction Logic
138
  # ------------------------------------------------------
139
  def smart_correction(text):
140
- if not text or not text.strip() or corrector is None:
141
  return text
 
 
142
 
143
- # DocTR returns text with newlines. LLMs often prefer line-by-line or chunked input
144
- # if the context isn't massive. For a small model, processing line-by-line is safer.
145
- lines = text.split('\n')
146
- corrected_lines = []
147
-
148
- print("--- Starting Correction ---")
149
- for line in lines:
150
- if len(line.strip()) < 3: # Skip empty/tiny lines
151
- corrected_lines.append(line)
152
- continue
153
-
154
- try:
155
- # Prompt engineering for Flan-T5
156
- prompt = f"Fix grammar and OCR errors: {line}"
157
-
158
- # max_length ensures it doesn't ramble.
159
- result = corrector(prompt, max_length=128)
160
- fixed_text = result[0]['generated_text']
161
-
162
- # Fallback: if model returns empty, keep original
163
- corrected_lines.append(fixed_text if fixed_text else line)
164
- except Exception as e:
165
- print(f"Correction failed for line '{line}': {e}")
166
- corrected_lines.append(line)
167
-
168
- return "\n".join(corrected_lines)
 
 
 
 
 
 
 
 
 
169
 
170
  # ------------------------------------------------------
171
- # 3. Main Processing Function
172
  # ------------------------------------------------------
173
  def run_ocr(input_image):
174
  tmp_path = None
@@ -176,22 +353,21 @@ def run_ocr(input_image):
176
  if input_image is None:
177
  return None, "No image uploaded", None, None
178
 
179
- # -- Save temp file for DocTR robustness --
180
  with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
181
  input_image.save(tmp.name)
182
  tmp_path = tmp.name
183
 
184
- # -- Run OCR --
185
  doc = DocumentFile.from_images(tmp_path)
186
  result = ocr_model(doc)
187
-
188
- # -- Raw Text --
189
  raw_text = result.render()
190
 
191
- # -- Correction Step --
 
192
  corrected_text = smart_correction(raw_text)
193
 
194
- # -- Visualization --
195
  image_np = np.array(input_image)
196
  viz_image = image_np.copy()
197
 
@@ -209,40 +385,36 @@ def run_ocr(input_image):
209
 
210
  except Exception as e:
211
  error_log = traceback.format_exc()
212
- return None, f"Error: {e}", f"Error Log:\n{error_log}", {"error": str(e)}
213
 
214
  finally:
215
  if tmp_path and os.path.exists(tmp_path):
216
  os.remove(tmp_path)
217
 
218
  # ------------------------------------------------------
219
- # 4. Gradio UI
220
  # ------------------------------------------------------
221
- with gr.Blocks(title="DocTR OCR + Correction") as demo:
222
- gr.Markdown("## 📄 AI OCR with Grammar Correction")
223
- gr.Markdown("Using `DocTR` for extraction and `Flan-T5-Small` for correction.")
224
 
225
  with gr.Row():
226
  input_img = gr.Image(type="pil", label="Upload Document")
227
 
228
  with gr.Row():
229
- btn = gr.Button("Run Extraction & Correction", variant="primary")
230
 
231
  with gr.Row():
232
  out_img = gr.Image(label="Detections")
233
 
234
  with gr.Row():
235
- out_raw = gr.Textbox(label="Raw OCR Text", lines=8, placeholder="Raw output appears here...")
236
- out_corrected = gr.Textbox(label=" Corrected Text", lines=8, placeholder="AI corrected output appears here...")
237
 
238
  with gr.Row():
239
- out_json = gr.JSON(label="Full JSON Data")
240
 
241
- btn.click(
242
- fn=run_ocr,
243
- inputs=input_img,
244
- outputs=[out_img, out_raw, out_corrected, out_json]
245
- )
246
 
247
  if __name__ == "__main__":
248
  demo.launch()
 
94
 
95
 
96
 
97
+
98
+
99
+
100
+ # import gradio as gr
101
+ # import numpy as np
102
+ # import cv2
103
+ # import traceback
104
+ # import tempfile
105
+ # import os
106
+ # from PIL import Image
107
+ # from doctr.io import DocumentFile
108
+ # from doctr.models import ocr_predictor
109
+ # from transformers import pipeline
110
+
111
+ # # ------------------------------------------------------
112
+ # # 1. Load Models Globally
113
+ # # ------------------------------------------------------
114
+ # print("⏳ Loading models...")
115
+
116
+ # # A. Load DocTR (OCR)
117
+ # try:
118
+ # # 'fast_base' is lightweight for CPU
119
+ # ocr_model = ocr_predictor(det_arch='fast_base', reco_arch='crnn_vgg16_bn', pretrained=True)
120
+ # print("✅ DocTR loaded.")
121
+ # except Exception as e:
122
+ # print(f"❌ DocTR Load Error: {e}")
123
+ # raise e
124
+
125
+ # # B. Load Corrector (Small Language Model)
126
+ # try:
127
+ # # 'google/flan-t5-small' is ~250MB, well under the 1GB limit.
128
+ # # We use a text2text-generation pipeline.
129
+ # corrector = pipeline(
130
+ # "text2text-generation",
131
+ # model="google/flan-t5-small",
132
+ # device=-1 # -1 forces CPU
133
+ # )
134
+ # print("✅ Correction model (Flan-T5-Small) loaded.")
135
+ # except Exception as e:
136
+ # print(f"❌ Corrector Load Error: {e}")
137
+ # corrector = None
138
+
139
+ # # ------------------------------------------------------
140
+ # # 2. Correction Logic
141
+ # # ------------------------------------------------------
142
+ # def smart_correction(text):
143
+ # if not text or not text.strip() or corrector is None:
144
+ # return text
145
+
146
+ # # DocTR returns text with newlines. LLMs often prefer line-by-line or chunked input
147
+ # # if the context isn't massive. For a small model, processing line-by-line is safer.
148
+ # lines = text.split('\n')
149
+ # corrected_lines = []
150
+
151
+ # print("--- Starting Correction ---")
152
+ # for line in lines:
153
+ # if len(line.strip()) < 3: # Skip empty/tiny lines
154
+ # corrected_lines.append(line)
155
+ # continue
156
+
157
+ # try:
158
+ # # Prompt engineering for Flan-T5
159
+ # prompt = f"Fix grammar and OCR errors: {line}"
160
+
161
+ # # max_length ensures it doesn't ramble.
162
+ # result = corrector(prompt, max_length=128)
163
+ # fixed_text = result[0]['generated_text']
164
+
165
+ # # Fallback: if model returns empty, keep original
166
+ # corrected_lines.append(fixed_text if fixed_text else line)
167
+ # except Exception as e:
168
+ # print(f"Correction failed for line '{line}': {e}")
169
+ # corrected_lines.append(line)
170
+
171
+ # return "\n".join(corrected_lines)
172
+
173
+ # # ------------------------------------------------------
174
+ # # 3. Main Processing Function
175
+ # # ------------------------------------------------------
176
+ # def run_ocr(input_image):
177
+ # tmp_path = None
178
+ # try:
179
+ # if input_image is None:
180
+ # return None, "No image uploaded", None, None
181
+
182
+ # # -- Save temp file for DocTR robustness --
183
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
184
+ # input_image.save(tmp.name)
185
+ # tmp_path = tmp.name
186
+
187
+ # # -- Run OCR --
188
+ # doc = DocumentFile.from_images(tmp_path)
189
+ # result = ocr_model(doc)
190
+
191
+ # # -- Raw Text --
192
+ # raw_text = result.render()
193
+
194
+ # # -- Correction Step --
195
+ # corrected_text = smart_correction(raw_text)
196
+
197
+ # # -- Visualization --
198
+ # image_np = np.array(input_image)
199
+ # viz_image = image_np.copy()
200
+
201
+ # for page in result.pages:
202
+ # for block in page.blocks:
203
+ # for line in block.lines:
204
+ # for word in line.words:
205
+ # h, w = viz_image.shape[:2]
206
+ # (x_min, y_min), (x_max, y_max) = word.geometry
207
+ # x1, y1 = int(x_min * w), int(y_min * h)
208
+ # x2, y2 = int(x_max * w), int(y_max * h)
209
+ # cv2.rectangle(viz_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
210
+
211
+ # return viz_image, raw_text, corrected_text, result.export()
212
+
213
+ # except Exception as e:
214
+ # error_log = traceback.format_exc()
215
+ # return None, f"Error: {e}", f"Error Log:\n{error_log}", {"error": str(e)}
216
+
217
+ # finally:
218
+ # if tmp_path and os.path.exists(tmp_path):
219
+ # os.remove(tmp_path)
220
+
221
+ # # ------------------------------------------------------
222
+ # # 4. Gradio UI
223
+ # # ------------------------------------------------------
224
+ # with gr.Blocks(title="DocTR OCR + Correction") as demo:
225
+ # gr.Markdown("## 📄 AI OCR with Grammar Correction")
226
+ # gr.Markdown("Using `DocTR` for extraction and `Flan-T5-Small` for correction.")
227
+
228
+ # with gr.Row():
229
+ # input_img = gr.Image(type="pil", label="Upload Document")
230
+
231
+ # with gr.Row():
232
+ # btn = gr.Button("Run Extraction & Correction", variant="primary")
233
+
234
+ # with gr.Row():
235
+ # out_img = gr.Image(label="Detections")
236
+
237
+ # with gr.Row():
238
+ # out_raw = gr.Textbox(label="Raw OCR Text", lines=8, placeholder="Raw output appears here...")
239
+ # out_corrected = gr.Textbox(label="✨ Corrected Text", lines=8, placeholder="AI corrected output appears here...")
240
+
241
+ # with gr.Row():
242
+ # out_json = gr.JSON(label="Full JSON Data")
243
+
244
+ # btn.click(
245
+ # fn=run_ocr,
246
+ # inputs=input_img,
247
+ # outputs=[out_img, out_raw, out_corrected, out_json]
248
+ # )
249
+
250
+ # if __name__ == "__main__":
251
+ # demo.launch()
252
+
253
+
254
+
255
+
256
+
257
+
258
+
import gradio as gr
import numpy as np
import cv2
import traceback
import tempfile
import os
import torch  # required by transformers model loading/inference backend
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from transformers import AutoModelForCausalLM, AutoTokenizer

# ------------------------------------------------------
# 1. Configuration & Global Loading
#
# Models are loaded once at module import so every Gradio
# request reuses them. A DocTR load failure is fatal (re-raised);
# an LLM load failure degrades gracefully: llm_model/tokenizer are
# set to None and correction becomes a no-op downstream.
# ------------------------------------------------------
print("⏳ Loading models...")

# A. Load DocTR (OCR)
# 'fast_base' detector + 'crnn_vgg16_bn' recognizer: lightweight combo for CPU.
try:
    ocr_model = ocr_predictor(det_arch='fast_base', reco_arch='crnn_vgg16_bn', pretrained=True)
    print("✅ DocTR loaded.")
except Exception as e:
    print(f"❌ DocTR Load Error: {e}")
    # OCR is the core feature — without it the app is useless, so abort startup.
    raise e

# B. Load correction LLM (currently Qwen2.5-1.5B-Instruct).
# Larger variants ("Qwen/Qwen2.5-3B-Instruct", "Qwen/Qwen2.5-7B-Instruct")
# improve quality at the cost of RAM and CPU latency — swap MODEL_ID to trade off.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

try:
    print(f"⬇️ Downloading & Loading {MODEL_ID}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    llm_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",  # let transformers pick the checkpoint's native dtype
        device_map="cpu"     # CPU-only deployment; relies on system RAM
    )
    print(f"✅ {MODEL_ID} loaded successfully.")
except Exception as e:
    print(f"❌ LLM Load Error: {e}")
    # Degrade gracefully: smart_correction() checks for None and skips correction.
    llm_model = None
    tokenizer = None
 
302
  # ------------------------------------------------------
303
+ # 2. Correction Logic (The "Smart" Fix)
304
  # ------------------------------------------------------
305
  def smart_correction(text):
306
+ if not text or not llm_model:
307
  return text
308
+
309
+ print("--- Starting AI Correction ---")
310
 
311
+ # 1. Construct the Prompt
312
+ # We ask the model to act as a text editor.
313
+ system_prompt = "You are a helpful assistant that corrects OCR text. Fix typos, capitalization, and grammar. Maintain the original line structure. Do not add any conversational text like 'Here is the corrected text'."
314
+ user_prompt = f"Correct the following OCR text:\n\n{text}"
315
+
316
+ messages = [
317
+ {"role": "system", "content": system_prompt},
318
+ {"role": "user", "content": user_prompt}
319
+ ]
320
+
321
+ text_input = tokenizer.apply_chat_template(
322
+ messages,
323
+ tokenize=False,
324
+ add_generation_prompt=True
325
+ )
326
+
327
+ model_inputs = tokenizer([text_input], return_tensors="pt").to("cpu")
328
+
329
+ # 2. Run Inference
330
+ # max_new_tokens limits the output length to avoid infinite loops
331
+ generated_ids = llm_model.generate(
332
+ model_inputs.input_ids,
333
+ max_new_tokens=1024,
334
+ temperature=0.1, # Low temp for factual/consistent results
335
+ do_sample=False # Greedy decoding is faster and more deterministic
336
+ )
337
+
338
+ # 3. Decode Output
339
+ # We strip the input tokens to get only the new (corrected) text
340
+ generated_ids = [
341
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
342
+ ]
343
+ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
344
+
345
+ return response
346
 
347
  # ------------------------------------------------------
348
+ # 3. Processing Pipeline
349
  # ------------------------------------------------------
350
  def run_ocr(input_image):
351
  tmp_path = None
 
353
  if input_image is None:
354
  return None, "No image uploaded", None, None
355
 
356
+ # Robust Temp File Handling
357
  with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
358
  input_image.save(tmp.name)
359
  tmp_path = tmp.name
360
 
361
+ # 1. Run OCR
362
  doc = DocumentFile.from_images(tmp_path)
363
  result = ocr_model(doc)
 
 
364
  raw_text = result.render()
365
 
366
+ # 2. Run AI Correction
367
+ # We pass the WHOLE text block at once. Context helps the AI.
368
  corrected_text = smart_correction(raw_text)
369
 
370
+ # 3. Visualization
371
  image_np = np.array(input_image)
372
  viz_image = image_np.copy()
373
 
 
385
 
386
  except Exception as e:
387
  error_log = traceback.format_exc()
388
+ return None, f"Error: {e}", f"Logs:\n{error_log}", {"error": str(e)}
389
 
390
  finally:
391
  if tmp_path and os.path.exists(tmp_path):
392
  os.remove(tmp_path)
393
 
# ------------------------------------------------------
# 4. Gradio Interface
#
# Layout: upload row -> action button -> detection preview ->
# raw/corrected text side by side -> full JSON export.
# ------------------------------------------------------
with gr.Blocks(title="Next-Gen OCR") as demo:
    gr.Markdown("## 📄 Next-Gen AI OCR")
    gr.Markdown(f"Using **DocTR** for extraction and **{MODEL_ID}** for smart correction.")

    with gr.Row():
        input_img = gr.Image(type="pil", label="Upload Document")

    with gr.Row():
        btn = gr.Button("Run Extraction & Smart Correction", variant="primary")

    with gr.Row():
        out_img = gr.Image(label="Detections")

    with gr.Row():
        out_raw = gr.Textbox(label="Raw OCR Output", lines=10)
        # Derive the label from MODEL_ID instead of hardcoding "Qwen 7B":
        # the configured model is 1.5B, and the label must track future swaps.
        out_corrected = gr.Textbox(label=f"🤖 AI Corrected ({MODEL_ID.split('/')[-1]})", lines=10)

    with gr.Row():
        out_json = gr.JSON(label="JSON Data")

    # Single pipeline entry point: OCR -> correction -> visualization -> JSON.
    btn.click(fn=run_ocr, inputs=input_img, outputs=[out_img, out_raw, out_corrected, out_json])


if __name__ == "__main__":
    demo.launch()