Spaces:

iammraat
/

docTR

Sleeping

App Files Files Community

iammraat committed on Jan 23

Commit

79ff71f

verified ·

1 Parent(s): 6bf2b4d

Update app.py

Browse files

Files changed (1) hide show

app.py +217 -29

app.py CHANGED Viewed

@@ -256,6 +256,184 @@
 import gradio as gr
 import numpy as np
 import cv2
@@ -280,9 +458,8 @@ except Exception as e:
     print(f"❌ DocTR Load Error: {e}")
     raise e
-# B. Load LLM (Qwen2.5-7B-Instruct)
-# With 50GB RAM, we can load this comfortably.
-# If it is too slow, change MODEL_ID to "Qwen/Qwen2.5-3B-Instruct" or "Qwen/Qwen2.5-1.5B-Instruct"
 MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
 try:
@@ -291,7 +468,7 @@ try:
     llm_model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype="auto",
-        device_map="cpu"  # Uses your 50GB System RAM
     )
     print(f"✅ {MODEL_ID} loaded successfully.")
 except Exception as e:
@@ -300,7 +477,7 @@ except Exception as e:
     tokenizer = None
 # ------------------------------------------------------
-# 2. Correction Logic (The "Smart" Fix)
 # ------------------------------------------------------
 def smart_correction(text):
     if not text or not llm_model:
@@ -309,8 +486,13 @@ def smart_correction(text):
     print("--- Starting AI Correction ---")
     # 1. Construct the Prompt
-    # We ask the model to act as a text editor.
-    system_prompt = "You are a helpful assistant that corrects OCR text. Fix typos, capitalization, and grammar. Maintain the original line structure. Do not add any conversational text like 'Here is the corrected text'."
     user_prompt = f"Correct the following OCR text:\n\n{text}"
     messages = [
@@ -318,31 +500,37 @@ def smart_correction(text):
         {"role": "user", "content": user_prompt}
     ]
     text_input = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True
     )
     model_inputs = tokenizer([text_input], return_tensors="pt").to("cpu")
     # 2. Run Inference
-    # max_new_tokens limits the output length to avoid infinite loops
-    generated_ids = llm_model.generate(
-        model_inputs.input_ids,
-        max_new_tokens=1024,
-        temperature=0.1, # Low temp for factual/consistent results
-        do_sample=False  # Greedy decoding is faster and more deterministic
-    )
-    # 3. Decode Output
-    # We strip the input tokens to get only the new (corrected) text
-    generated_ids = [
-        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-    ]
-    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return response
 # ------------------------------------------------------
 # 3. Processing Pipeline
@@ -353,7 +541,7 @@ def run_ocr(input_image):
         if input_image is None:
             return None, "No image uploaded", None, None
-        # Robust Temp File Handling
         with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
             input_image.save(tmp.name)
             tmp_path = tmp.name
@@ -364,7 +552,7 @@ def run_ocr(input_image):
         raw_text = result.render()
         # 2. Run AI Correction
-        # We pass the WHOLE text block at once. Context helps the AI.
         corrected_text = smart_correction(raw_text)
         # 3. Visualization
@@ -394,9 +582,9 @@ def run_ocr(input_image):
 # ------------------------------------------------------
 # 4. Gradio Interface
 # ------------------------------------------------------
-with gr.Blocks(title="Next-Gen OCR") as demo:
-    gr.Markdown("## 📄 Next-Gen AI OCR")
-    gr.Markdown(f"Using **DocTR** for extraction and **{MODEL_ID}** for smart correction.")
     with gr.Row():
         input_img = gr.Image(type="pil", label="Upload Document")
@@ -409,7 +597,7 @@ with gr.Blocks(title="Next-Gen OCR") as demo:
     with gr.Row():
         out_raw = gr.Textbox(label="Raw OCR Output", lines=10)
-        out_corrected = gr.Textbox(label="🤖 AI Corrected (Qwen 7B)", lines=10)
     with gr.Row():
         out_json = gr.JSON(label="JSON Data")

+# import gradio as gr
+# import numpy as np
+# import cv2
+# import traceback
+# import tempfile
+# import os
+# import torch
+# from doctr.io import DocumentFile
+# from doctr.models import ocr_predictor
+# from transformers import AutoModelForCausalLM, AutoTokenizer
+# # ------------------------------------------------------
+# # 1. Configuration & Global Loading
+# # ------------------------------------------------------
+# print("⏳ Loading models...")
+# # A. Load DocTR (OCR)
+# try:
+#     ocr_model = ocr_predictor(det_arch='fast_base', reco_arch='crnn_vgg16_bn', pretrained=True)
+#     print("✅ DocTR loaded.")
+# except Exception as e:
+#     print(f"❌ DocTR Load Error: {e}")
+#     raise e
+# # B. Load LLM (Qwen2.5-7B-Instruct)
+# # With 50GB RAM, we can load this comfortably.
+# # If it is too slow, change MODEL_ID to "Qwen/Qwen2.5-3B-Instruct" or "Qwen/Qwen2.5-1.5B-Instruct"
+# MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
+# try:
+#     print(f"⬇️ Downloading & Loading {MODEL_ID}...")
+#     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+#     llm_model = AutoModelForCausalLM.from_pretrained(
+#         MODEL_ID,
+#         torch_dtype="auto",
+#         device_map="cpu"  # Uses your 50GB System RAM
+#     )
+#     print(f"✅ {MODEL_ID} loaded successfully.")
+# except Exception as e:
+#     print(f"❌ LLM Load Error: {e}")
+#     llm_model = None
+#     tokenizer = None
+# # ------------------------------------------------------
+# # 2. Correction Logic (The "Smart" Fix)
+# # ------------------------------------------------------
+# def smart_correction(text):
+#     if not text or not llm_model:
+#         return text
+#     print("--- Starting AI Correction ---")
+#     # 1. Construct the Prompt
+#     # We ask the model to act as a text editor.
+#     system_prompt = "You are a helpful assistant that corrects OCR text. Fix typos, capitalization, and grammar. Maintain the original line structure. Do not add any conversational text like 'Here is the corrected text'."
+#     user_prompt = f"Correct the following OCR text:\n\n{text}"
+#     messages = [
+#         {"role": "system", "content": system_prompt},
+#         {"role": "user", "content": user_prompt}
+#     ]
+#     text_input = tokenizer.apply_chat_template(
+#         messages,
+#         tokenize=False,
+#         add_generation_prompt=True
+#     )
+#     model_inputs = tokenizer([text_input], return_tensors="pt").to("cpu")
+#     # 2. Run Inference
+#     # max_new_tokens limits the output length to avoid infinite loops
+#     generated_ids = llm_model.generate(
+#         model_inputs.input_ids,
+#         max_new_tokens=1024,
+#         temperature=0.1, # Low temp for factual/consistent results
+#         do_sample=False  # Greedy decoding is faster and more deterministic
+#     )
+#     # 3. Decode Output
+#     # We strip the input tokens to get only the new (corrected) text
+#     generated_ids = [
+#         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+#     ]
+#     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+#     return response
+# # ------------------------------------------------------
+# # 3. Processing Pipeline
+# # ------------------------------------------------------
+# def run_ocr(input_image):
+#     tmp_path = None
+#     try:
+#         if input_image is None:
+#             return None, "No image uploaded", None, None
+#         # Robust Temp File Handling
+#         with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
+#             input_image.save(tmp.name)
+#             tmp_path = tmp.name
+#         # 1. Run OCR
+#         doc = DocumentFile.from_images(tmp_path)
+#         result = ocr_model(doc)
+#         raw_text = result.render()
+#         # 2. Run AI Correction
+#         # We pass the WHOLE text block at once. Context helps the AI.
+#         corrected_text = smart_correction(raw_text)
+#         # 3. Visualization
+#         image_np = np.array(input_image)
+#         viz_image = image_np.copy()
+#         for page in result.pages:
+#             for block in page.blocks:
+#                 for line in block.lines:
+#                     for word in line.words:
+#                         h, w = viz_image.shape[:2]
+#                         (x_min, y_min), (x_max, y_max) = word.geometry
+#                         x1, y1 = int(x_min * w), int(y_min * h)
+#                         x2, y2 = int(x_max * w), int(y_max * h)
+#                         cv2.rectangle(viz_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
+#         return viz_image, raw_text, corrected_text, result.export()
+#     except Exception as e:
+#         error_log = traceback.format_exc()
+#         return None, f"Error: {e}", f"Logs:\n{error_log}", {"error": str(e)}
+#     finally:
+#         if tmp_path and os.path.exists(tmp_path):
+#             os.remove(tmp_path)
+# # ------------------------------------------------------
+# # 4. Gradio Interface
+# # ------------------------------------------------------
+# with gr.Blocks(title="Next-Gen OCR") as demo:
+#     gr.Markdown("## 📄 Next-Gen AI OCR")
+#     gr.Markdown(f"Using **DocTR** for extraction and **{MODEL_ID}** for smart correction.")
+#     with gr.Row():
+#         input_img = gr.Image(type="pil", label="Upload Document")
+#     with gr.Row():
+#         btn = gr.Button("Run Extraction & Smart Correction", variant="primary")
+#     with gr.Row():
+#         out_img = gr.Image(label="Detections")
+#     with gr.Row():
+#         out_raw = gr.Textbox(label="Raw OCR Output", lines=10)
+#         out_corrected = gr.Textbox(label="🤖 AI Corrected (Qwen 7B)", lines=10)
+#     with gr.Row():
+#         out_json = gr.JSON(label="JSON Data")
+#     btn.click(fn=run_ocr, inputs=input_img, outputs=[out_img, out_raw, out_corrected, out_json])
+# if __name__ == "__main__":
+#     demo.launch()
 import gradio as gr
 import numpy as np
 import cv2
     print(f"❌ DocTR Load Error: {e}")
     raise e
+# B. Load LLM (model is selected by MODEL_ID below; currently Qwen2.5-1.5B-Instruct)
+# For reference: the 3B variant fits easily in 18GB RAM (takes ~6GB), allowing space for OS + OCR.
 MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
 try:
     llm_model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype="auto",
+        device_map="cpu"  # Efficiently uses RAM
     )
     print(f"✅ {MODEL_ID} loaded successfully.")
 except Exception as e:
     tokenizer = None
 # ------------------------------------------------------
+# 2. Correction Logic (Context-Aware)
 # ------------------------------------------------------
 def smart_correction(text):
     if not text or not llm_model:
     print("--- Starting AI Correction ---")
     # 1. Construct the Prompt
+    # We explicitly tell it to fix OCR errors and maintain structure.
+    system_prompt = (
+        "You are an expert OCR post-processing assistant. "
+        "Your task is to correct OCR errors, typos, and grammar in the provided text. "
+        "Maintain the original line breaks and layout strictly. "
+        "Do not add any conversational text. Output ONLY the corrected text."
+    )
     user_prompt = f"Correct the following OCR text:\n\n{text}"
     messages = [
         {"role": "user", "content": user_prompt}
     ]
+    # Apply chat template
     text_input = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True
     )
+    # Tokenize
     model_inputs = tokenizer([text_input], return_tensors="pt").to("cpu")
     # 2. Run Inference
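+    # Greedy decoding (do_sample=False) is faster and deterministic, which discourages "creative" rewrites.
+    # NOTE: temperature only applies when sampling is enabled, so it has no effect here.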
+    try:
+        generated_ids = llm_model.generate(
+            model_inputs.input_ids,
+            max_new_tokens=1024,
+            temperature=0.1,
+            do_sample=False
+        )
+        # 3. Decode Output
+        # Strip input tokens to get only the new text
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return response
+    except Exception as e:
+        print(f"Inference Error: {e}")
+        return text # Fallback to original if AI fails
 # ------------------------------------------------------
 # 3. Processing Pipeline
         if input_image is None:
             return None, "No image uploaded", None, None
+        # Temp file for robust loading
         with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
             input_image.save(tmp.name)
             tmp_path = tmp.name
         raw_text = result.render()
         # 2. Run AI Correction
+        # The model selected by MODEL_ID is expected to be fast enough to handle the full page context at once.
         corrected_text = smart_correction(raw_text)
         # 3. Visualization
 # ------------------------------------------------------
 # 4. Gradio Interface
 # ------------------------------------------------------
+with gr.Blocks(title="AI OCR with Qwen 3B") as demo:
+    gr.Markdown("## 📄 Robust AI OCR")
+    gr.Markdown(f"Using **DocTR** for text extraction and **{MODEL_ID}** for intelligent grammar correction.")
     with gr.Row():
         input_img = gr.Image(type="pil", label="Upload Document")
     with gr.Row():
         out_raw = gr.Textbox(label="Raw OCR Output", lines=10)
+        out_corrected = gr.Textbox(label="🤖 AI Corrected (Qwen 3B)", lines=10)
     with gr.Row():
         out_json = gr.JSON(label="JSON Data")