"""Gradio demo that extracts text from an uploaded image with Moondream."""

from typing import Optional

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model load (slow the first time; the weights are cached afterwards).
# Alternatively, try "moondream/moondream3-preview".
model_id = "vikhyatk/moondream2"
# Check the Hugging Face model page for the latest stable revision.
revision = "2025-06-21"

DEFAULT_PROMPT = "Extract all text from this image accurately."
MAX_ANSWER_TOKENS = 512

_device = "cuda" if torch.cuda.is_available() else "cpu"

# The model card loads the checkpoint with an explicit single-device map.
# The remote-code query API takes the prompt as plain text, so no separate
# tokenizer is loaded here.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map={"": _device},
)


def ocr_image(image: Optional[Image.Image], prompt: str = DEFAULT_PROMPT) -> str:
    """Ask Moondream to read the text in *image*.

    Args:
        image: PIL image supplied by the Gradio image component, or ``None``
            when nothing has been uploaded.
        prompt: Instruction sent to the model. A blank prompt falls back to
            ``DEFAULT_PROMPT``.

    Returns:
        The model's answer, or a user-facing message when no image was
        provided or the model returned no text.
    """
    if image is None:
        return "দয়া করে ছবি আপলোড করুন।"

    # The textbox yields "" when the user clears it; never query with an
    # empty instruction.
    question = (prompt or "").strip() or DEFAULT_PROMPT

    # Moondream's remote code exposes `query(image, question)`, which returns
    # a dict with the generated text under "answer".
    # NOTE(review): greedy decoding may need `"temperature": 0` in `settings`
    # — confirm the pinned revision supports it before adding.
    result = model.query(
        image,
        question,
        settings={"max_tokens": MAX_ANSWER_TOKENS},
    )
    generated_text = (result.get("answer") or "").strip()

    return generated_text if generated_text else "কোনো টেক্সট পাওয়া যায়নি।"


demo = gr.Interface(
    fn=ocr_image,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Custom Prompt (optional)", value=DEFAULT_PROMPT),
    ],
    outputs="text",
    title="Moondream OCR - Any Language Try",
    description="Moondream দিয়ে ছবি থেকে টেক্সট extract করুন। Prompt customize করতে পারেন (e.g., Bangla text চাইলে 'Extract Bangla text' বলুন)।",
)

if __name__ == "__main__":
    demo.launch()