Spaces: Runtime error

Update app.py
app.py CHANGED
@@ -6,99 +6,90 @@ from peft import PeftModel
 import gc
 import os
 
-# Add this line immediately after your imports
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
 
 # --- Configuration ---
 base_model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
 adapter_id = "A-M-R-A-G/Basira"
+hf_token = os.getenv("token_HF")
 
 # --- Model Loading ---
 print("Loading base model...")
+# Added device_map="auto" and trust_remote_code
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     base_model_id,
     torch_dtype=torch.float16,
     device_map="auto",
-
+    trust_remote_code=True,
+    token=hf_token
 )
 
 print("Loading processor...")
 processor = AutoProcessor.from_pretrained(
     base_model_id,
-    token=
+    token=hf_token
 )
-processor.tokenizer.padding_side = "right"
 
 print("Loading and applying adapter...")
+# FIX: Use the 'model' attribute specifically if Peft struggles with the wrapper
+# Or simply ensure the base model is fully loaded before wrapping
 model = PeftModel.from_pretrained(model, adapter_id)
+model.eval()  # Set to evaluation mode
 print("Model loaded successfully!")
 
 # --- The Inference Function ---
 def perform_ocr_on_image(image_input: Image.Image) -> str:
-    """
-    This is the core function that Gradio will call.
-    It takes a PIL image and returns the transcribed text string.
-    """
     if image_input is None:
         return "Please upload an image."
 
     try:
-        # Format the prompt
+        # 1. Format the prompt
         messages = [
             {
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image_input},
-                    {"type": "text", "text":
-                        "Analyze the input image and detect all Arabic text. "
-                        "Output only the extracted text—verbatim and in its original script—"
-                        "without any added commentary, translation, punctuation or formatting. "
-                        "Present each line of text as plain UTF-8 strings, with no extra characters or words."
-                    )},
+                    {"type": "text", "text": "Analyze the input image and detect all Arabic text. Output only the extracted text—verbatim and in its original script."},
                 ],
             }
         ]
+
+        # 2. Apply chat template
         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
-        #
-        inputs = processor(text=text, images=image_input, return_tensors="pt").to(model.device)
-
-        # Generate prediction
+        # 3. Prepare inputs correctly for Qwen2.5-VL
+        # Note: Some versions require 'images' to be a list even if it's one image
+        inputs = processor(text=[text], images=[image_input], padding=True, return_tensors="pt").to(model.device)
 
+        # 4. Generate
         with torch.no_grad():
+            # Use the underlying model's generation to avoid PEFT wrapper conflicts
            generated_ids = model.generate(**inputs, max_new_tokens=512)
 
-
-        # Decode the output
-        full_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        #
-
-
-
-
-
-
-        else:
-            # If the marker isn't found, return the full response as a fallback
-            cleaned_response = full_response
-        # --- END OF FIX ---
+        # 5. Decode only the NEW tokens to avoid manual string splitting
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        cleaned_response = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
 
         # Clean up
         gc.collect()
         torch.cuda.empty_cache()
+        return cleaned_response.strip()
+
     except Exception as e:
         print(f"An error occurred during inference: {e}")
         return f"An error occurred: {str(e)}"
 
-# ---
+# --- Interface ---
 demo = gr.Interface(
     fn=perform_ocr_on_image,
     inputs=gr.Image(type="pil", label="Upload Arabic Document Image"),
     outputs=gr.Textbox(label="Transcription", lines=10, show_copy_button=True),
     title="Basira: Fine-Tuned Qwen-VL for Arabic OCR",
-    description="A demo for the Qwen-VL 2.5 (3B) model, fine-tuned for enhanced Arabic OCR.
+    description="A demo for the Qwen-VL 2.5 (3B) model, fine-tuned for enhanced Arabic OCR.",
     allow_flagging="never"
 )
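The substantive fix in this commit is step 5: instead of decoding the whole sequence and then splitting on a marker string (the old full_response / cleaned_response fallback logic), the prompt tokens are sliced off each generated sequence before decoding. A minimal standalone sketch of that pattern, assuming a processor/model pair loaded as in app.py above (the helper name decode_new_tokens is illustrative, not part of app.py):

import torch

def decode_new_tokens(processor, model, inputs, max_new_tokens=512):
    # Generate, then drop the prompt tokens from the front of each output
    # sequence so only the newly produced text is decoded. This replaces
    # the old marker-based string splitting with a simple slice.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

# Usage inside perform_ocr_on_image:
# cleaned_response = decode_new_tokens(processor, model, inputs)

One caveat on the token handling: os.getenv("token_HF") expects a Space secret named token_HF. If that secret is not set, it returns None, and from_pretrained(..., token=None) falls back to unauthenticated access, which will fail if the base model or adapter repo is private or gated.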