Update app.py
app.py CHANGED
@@ -20,139 +20,44 @@ import spaces
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-class SanskritTranscriptionModel:
-    def __init__(self, model_path: str, adapter_path: str = None):
-        """Initialize the model and processor"""
-        self.model_path = model_path
-        self.adapter_path = adapter_path
-        self.model = None
-        self.processor = None
-        self.is_loaded = False
-
-    def load_model(self):
-        """Load the model and processor"""
-        if self.is_loaded:
-            return
-
-        try:
-            logger.info("Loading processor...")
-            self.processor = AutoProcessor.from_pretrained(self.model_path)
-
-            logger.info("Loading base model...")
-            # Check if CUDA is available, otherwise use CPU
-            device_map = "auto" if torch.cuda.is_available() else "cpu"
-            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-                self.model_path,
-                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-                device_map=device_map
-            )
-
-            if self.adapter_path and os.path.exists(self.adapter_path):
-                logger.info("Loading LoRA adapters...")
-                self.model = PeftModel.from_pretrained(self.model, self.adapter_path)
-            else:
-                logger.info("No adapter path found, using base model only")
-
-            self.model.eval()
-            device = next(self.model.parameters()).device
-            logger.info(f"Model loaded on device: {device}")
-            self.is_loaded = True
-
-        except Exception as e:
-            logger.error(f"Error loading model: {e}")
-            raise e
-
-    def transcribe_image(self, image: Image.Image, prompt: str = None) -> str:
-        """Transcribe Sanskrit text from image"""
-        if not self.is_loaded:
-            self.load_model()
-
-        if prompt is None:
-            prompt = "Please transcribe the Sanskrit text shown in this image:"
-
-        try:
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image", "image": image},
-                        {"type": "text", "text": prompt}
-                    ]
-                }
-            ]
-
-            # Preparation for inference
-            text = self.processor.apply_chat_template(
-                messages, tokenize=False, add_generation_prompt=True
-            )
-            image_inputs, video_inputs = process_vision_info(messages)
-            inputs = self.processor(
-                text=[text],
-                images=image_inputs,
-                videos=video_inputs,
-                padding=True,
-                return_tensors="pt",
-            )
-
-            # Get model device and move inputs there
-            model_device = next(self.model.parameters()).device
-            inputs = {k: v.to(model_device) for k, v in inputs.items()}
-
-            with torch.no_grad():
-                generated_ids = self.model.generate(
-                    **inputs,
-                    max_new_tokens=512,
-                    do_sample=False,
-                    pad_token_id=self.processor.tokenizer.eos_token_id,
-                    use_cache=True,
-                    repetition_penalty=1.1
-                )
-
-            # Extract only the generated part
-            generated_ids_trimmed = [
-                out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
-            ]
-            output_text = self.processor.batch_decode(
-                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-            )
-
-            return output_text[0] if output_text else ""
-
-        except Exception as e:
-            logger.error(f"Error generating response: {e}")
-            return f"Error: {str(e)}"
 
-#
-
+# Load model at module level (global scope)
+model_path = 'Qwen/Qwen2.5-VL-7B-Instruct'
+adapter_path = './outputs/out-qwen2-5-vl'
+
+logger.info("Loading processor...")
+processor = AutoProcessor.from_pretrained(model_path)
+
+logger.info("Loading base model...")
+# Check if CUDA is available, otherwise use CPU
+device_map = "auto" if torch.cuda.is_available() else "cpu"
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+    device_map=device_map
+)
+
+if adapter_path and os.path.exists(adapter_path):
+    logger.info("Loading LoRA adapters...")
+    model = PeftModel.from_pretrained(model, adapter_path)
+else:
+    logger.info("No adapter path found, using base model only")
+
+model.eval()
+device = next(model.parameters()).device
+logger.info(f"Model loaded on device: {device}")
 
 def check_model_status():
     """Check if model is loaded and ready"""
     try:
-
-        if model_instance is not None and model_instance.is_loaded:
+        if model is not None and processor is not None:
             return "✅ Model loaded and ready"
         else:
             return "⏳ Model not loaded yet"
     except Exception as e:
         return f"❌ Model error: {str(e)}"
 
+@spaces.GPU
 def transcribe_sanskrit(image, custom_prompt, progress=gr.Progress()):
     """Gradio interface function for transcription using pre-loaded model"""
     if image is None:
@@ -160,19 +65,59 @@ def transcribe_sanskrit(image, custom_prompt, progress=gr.Progress()):
 
     try:
         progress(0.1, desc="Processing image...")
-        # Use the pre-loaded model instance
-        global model_instance
-        if model_instance is None or not model_instance.is_loaded:
-            return "❌ Model not loaded. Please wait for the model to initialize or refresh the page."
 
         # Use custom prompt if provided, otherwise use default
         prompt = custom_prompt if custom_prompt.strip() else "Please transcribe the Sanskrit text shown in this image:"
 
+        # Format the conversation using chat template
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": prompt}
+                ]
+            }
+        ]
+
+        # Preparation for inference
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+
+        # Get model device and move inputs there
+        model_device = next(model.parameters()).device
+        inputs = {k: v.to(model_device) for k, v in inputs.items()}
+
         progress(0.5, desc="Generating transcription...")
-
+        with torch.no_grad():
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=512,
+                do_sample=False,
+                pad_token_id=processor.tokenizer.eos_token_id,
+                use_cache=True,
+                repetition_penalty=1.1
+            )
+
+        # Extract only the generated part
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
 
         progress(1.0, desc="Complete!")
-        return
+        return output_text[0] if output_text else ""
 
     except Exception as e:
         logger.error(f"Error in transcribe_sanskrit: {e}")
@@ -253,9 +198,6 @@ def create_gradio_interface():
         - High accuracy transcription
         """)
 
-        # Example section
-        with gr.Row():
-            gr.Markdown("### Example Images")
 
         # Event handlers
         transcribe_btn.click(
@@ -279,9 +221,9 @@ def create_gradio_interface():
             outputs=model_status
         )
 
-        #
+        # Check model status on app load
        app.load(
-            fn=
+            fn=check_model_status,
             outputs=model_status
         )
 
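The core move in this commit is replacing the lazily-initialized SanskritTranscriptionModel class with module-level globals plus a @spaces.GPU-decorated handler, which is the usual ZeroGPU Spaces layout: weights load once at import time, and only the GPU-bound call is decorated. Below is a minimal sketch of that layout, not the app itself — the tiny checkpoint sshleifer/tiny-gpt2 and the names run/demo are illustrative stand-ins, not part of app.py:

import spaces
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load weights once at import time (module scope), as app.py now does.
tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")  # tiny stand-in model
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
model.eval()

@spaces.GPU  # ZeroGPU attaches a GPU only while this function runs
def run(prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=16, do_sample=False)
    return tokenizer.decode(out[0], skip_special_tokens=True)

demo = gr.Interface(fn=run, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()

Keeping initialization out of the request path matters here because @spaces.GPU holds a GPU only for the duration of each call; reloading a 7B model inside the handler would dominate every request.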
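The adapter branch is unchanged in spirit: if ./outputs/out-qwen2-5-vl exists, PeftModel.from_pretrained wraps the base model with the LoRA weights. A sketch of that check as a reusable helper — attach_lora is a hypothetical name, and the merge_and_unload() step is an optional PEFT extra that this commit does not use:

import os
from peft import PeftModel
from transformers import PreTrainedModel

def attach_lora(base_model: PreTrainedModel, adapter_dir: str) -> PreTrainedModel:
    """Wrap base_model with LoRA adapters from adapter_dir if it exists."""
    if os.path.exists(adapter_dir):
        lora_model = PeftModel.from_pretrained(base_model, adapter_dir)
        # Optional: fold the adapter weights into the base model for faster inference.
        return lora_model.merge_and_unload()
    return base_model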
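One subtlety worth noting in the new transcribe_sanskrit: for decoder-only models like Qwen2.5-VL, generate() returns the prompt tokens followed by the completion, which is why each output row is sliced at len(in_ids) before batch_decode. A standalone illustration of that slice with made-up token IDs:

import torch

# Pretend batch of one: 3 prompt tokens; generate() echoes them plus 2 new tokens.
input_ids = torch.tensor([[101, 7592, 2088]])
generated_ids = torch.tensor([[101, 7592, 2088, 999, 102]])

# Same slice as app.py: drop the echoed prompt, keep only the completion.
trimmed = [out[len(inp):] for inp, out in zip(input_ids, generated_ids)]
print(trimmed)  # [tensor([999, 102])]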