Update app.py
app.py CHANGED

@@ -2,77 +2,75 @@ import gradio as gr
 import torch
 import json
 from PIL import Image
-from transformers import AutoProcessor,
+from transformers import AutoProcessor, AutoModelForVision2Seq
 
-
+# RECOMMENDATION: If on free CPU space, use "Qwen/Qwen2-VL-2B-Instruct"
+# to avoid Out-Of-Memory crashes.
+MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Processor
-processor = AutoProcessor.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True
-)
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
-# Model
-model =
+# Model
+model = AutoModelForVision2Seq.from_pretrained(
     MODEL_ID,
     trust_remote_code=True,
-
+    # bfloat16 is better for Qwen and uses half the memory of float32
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+    low_cpu_mem_usage=True,
     device_map="auto"
 )
 
 model.eval()
 
 def extract_document(image: Image.Image):
-
-
-
-
-    Extract:
-    - document_type
-    - key-value fields
-    - tables with rows and columns
-
-    Be document-agnostic.
-    Do not hallucinate.
-    """
+    if image is None:
+        return {"error": "No image uploaded"}
+
+    prompt = "<|im_start|>system\nYou are a universal document understanding AI. Return ONLY valid JSON.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Extract document_type, key-value fields, and tables from this document.<|im_end|>\n<|im_start|>assistant\n"
 
+    # Process image and text
     inputs = processor(
-
-
-
-
+        text=[prompt],
+        images=[image],
+        padding=True,
+        return_tensors="pt",
+    ).to(device)
 
+    # Generate
     with torch.no_grad():
-
-
-
-
-    )
-
-
+        generated_ids = model.generate(**inputs, max_new_tokens=1024)
+
+    # Trim the input tokens from the output
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+
+    text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
 
     try:
+        # Extract JSON block
         start = text.find("{")
         end = text.rfind("}") + 1
         return json.loads(text[start:end])
     except Exception:
-        return {
-            "error": "Model output could not be parsed",
-            "raw_output": text
-        }
+        return {"raw_output": text}
 
 with gr.Blocks() as demo:
     gr.Markdown("# 📄 DocAI – Universal Document Intelligence")
-
-
-
-
-
-
-
-
-
-
-
+    gr.Markdown("Using Qwen2.5-VL for structured document extraction.")
+
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(type="pil", label="Upload document")
+            btn = gr.Button("Extract Data", variant="primary")
+        with gr.Column():
+            output_json = gr.JSON(label="Extracted JSON")
+
+    btn.click(extract_document, inputs=image_input, outputs=output_json)
+
+demo.launch()
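Note: the RECOMMENDATION comment suggests the 2B checkpoint for free CPU Spaces. A low-friction way to act on it is an environment override rather than editing the file; a sketch, where DOC_MODEL_ID is an assumed variable name that the Space does not currently read:

import os

# Assumed override hook: DOC_MODEL_ID is illustrative, not defined by this Space.
MODEL_ID = os.environ.get("DOC_MODEL_ID", "Qwen/Qwen2.5-VL-7B-Instruct")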
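Note: hard-coding the <|im_start|>/<|vision_start|> markup in prompt works, but it can drift out of sync with the checkpoint's chat template. A minimal sketch of the template-driven alternative, assuming the official Qwen2.5-VL processor (whose apply_chat_template inserts the vision tokens itself):

# Sketch: build the prompt from the processor's chat template instead of
# hand-written <|im_start|> markup. Assumes the official Qwen2.5-VL processor.
messages = [
    {"role": "system", "content": [
        {"type": "text", "text": "You are a universal document understanding AI. Return ONLY valid JSON."},
    ]},
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Extract document_type, key-value fields, and tables from this document."},
    ]},
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], padding=True, return_tensors="pt").to(device)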
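Note: for a quick check of extract_document without launching the Gradio UI, a minimal smoke test; "sample_doc.png" is a hypothetical placeholder, not a file shipped with the Space:

# Hypothetical smoke test; replace "sample_doc.png" with a real document image.
from PIL import Image
import json

img = Image.open("sample_doc.png").convert("RGB")
print(json.dumps(extract_document(img), indent=2, ensure_ascii=False))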