Spaces:

A-M-R-A-G
/

Basira-Qwen-based

Runtime error

App Files Community

A-M-R-A-G committed on Feb 25

Commit

574930d

verified ·

1 Parent(s): 28e990c

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -23

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import gradio as gr
 from PIL import Image
-import torch
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from peft import PeftModel
-import gc
-import os
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
 # --- Configuration ---
@@ -15,7 +16,6 @@ hf_token = os.getenv("token_HF")
 # --- Model Loading ---
 print("Loading base model...")
-# Added device_map="auto" and trust_remote_code
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     base_model_id,
     torch_dtype=torch.float16,
@@ -29,67 +29,78 @@ processor = AutoProcessor.from_pretrained(
     base_model_id,
     token=hf_token
 )
 print("Loading and applying adapter...")
-# FIX: Use the 'model' attribute specifically if Peft struggles with the wrapper
-# Or simply ensure the base model is fully loaded before wrapping
 model = PeftModel.from_pretrained(model, adapter_id)
-model.eval() # Set to evaluation mode
 print("Model loaded successfully!")
 # --- The Inference Function ---
 def perform_ocr_on_image(image_input: Image.Image) -> str:
     if image_input is None:
         return "Please upload an image."
     try:
-        # 1. Format the prompt
         messages = [
             {
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image_input},
-                    {"type": "text", "text": "Analyze the input image and detect all Arabic text. Output only the extracted text—verbatim and in its original script."},
                 ],
             }
         ]
-        # 2. Apply chat template
         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        # 3. Prepare inputs correctly for Qwen2.5-VL
-        # Note: Some versions require 'images' to be a list even if it's one image
         inputs = processor(text=[text], images=[image_input], padding=True, return_tensors="pt").to(model.device)
-        # 4. Generate
         with torch.no_grad():
-            # Use the underlying model's generation to avoid PEFT wrapper conflicts
             generated_ids = model.generate(**inputs, max_new_tokens=512)
-        # 5. Decode only the NEW tokens to avoid manual string splitting
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         cleaned_response = processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )[0]
-        # Clean up
         gc.collect()
         torch.cuda.empty_cache()
         return cleaned_response.strip()
     except Exception as e:
         print(f"An error occurred during inference: {e}")
         return f"An error occurred: {str(e)}"
-# --- Interface ---
 demo = gr.Interface(
     fn=perform_ocr_on_image,
     inputs=gr.Image(type="pil", label="Upload Arabic Document Image"),
     outputs=gr.Textbox(label="Transcription", lines=10, show_copy_button=True),
     title="Basira: Fine-Tuned Qwen-VL for Arabic OCR",
-    description="A demo for the Qwen-VL 2.5 (3B) model, fine-tuned for enhanced Arabic OCR.",
     allow_flagging="never"
 )

+import os
+import gc
+import torch
 import gradio as gr
 from PIL import Image
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from peft import PeftModel
+# Force sync for debugging if needed
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
 # --- Configuration ---
 # --- Model Loading ---
 print("Loading base model...")
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     base_model_id,
     torch_dtype=torch.float16,
     base_model_id,
     token=hf_token
 )
+processor.tokenizer.padding_side = "right"
 print("Loading and applying adapter...")
+# Using the direct model load to bypass the PEFT KeyError bug
 model = PeftModel.from_pretrained(model, adapter_id)
+model.eval()
 print("Model loaded successfully!")
 # --- The Inference Function ---
 def perform_ocr_on_image(image_input: Image.Image) -> str:
+    """
+    Takes a PIL image and returns the transcribed Arabic text.
+    """
     if image_input is None:
         return "Please upload an image."
     try:
+        # Format the prompt using the chat template
         messages = [
             {
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image_input},
+                    {"type": "text", "text": (
+                        "Analyze the input image and detect all Arabic text. "
+                        "Output only the extracted text—verbatim and in its original script—"
+                        "without any added commentary, translation, punctuation or formatting. "
+                        "Present each line of text as plain UTF-8 strings, with no extra characters or words."
+                    )},
                 ],
             }
         ]
+        # Apply template
         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        # Prepare inputs
         inputs = processor(text=[text], images=[image_input], padding=True, return_tensors="pt").to(model.device)
+        # Generate prediction
         with torch.no_grad():
             generated_ids = model.generate(**inputs, max_new_tokens=512)
+        # Trim the input tokens from the output to get only the response
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         cleaned_response = processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
         )[0]
+        # Clean up memory
         gc.collect()
         torch.cuda.empty_cache()
         return cleaned_response.strip()
     except Exception as e:
         print(f"An error occurred during inference: {e}")
         return f"An error occurred: {str(e)}"
+# --- Create and Launch the Gradio Interface ---
 demo = gr.Interface(
     fn=perform_ocr_on_image,
     inputs=gr.Image(type="pil", label="Upload Arabic Document Image"),
     outputs=gr.Textbox(label="Transcription", lines=10, show_copy_button=True),
     title="Basira: Fine-Tuned Qwen-VL for Arabic OCR",
+    description="A demo for the Qwen-VL 2.5 (3B) model, fine-tuned for enhanced Arabic OCR. Upload an image to see the transcription.",
     allow_flagging="never"
 )