kj03 committed on
Commit
3c9b1a8
·
verified ·
1 Parent(s): c7df4ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -26
app.py CHANGED
@@ -1,38 +1,43 @@
1
  import gradio as gr
2
- from transformers import DetrImageProcessor, DetrForObjectDetection
 
3
  import torch
4
- from PIL import Image, ImageDraw
 
5
 
6
- # Load pre-trained model and processor
7
- processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
8
- model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
9
 
10
- # Object detection function
11
- def detect_objects(image):
12
- # Convert image and run model
13
- inputs = processor(images=image, return_tensors="pt")
14
- outputs = model(**inputs)
15
 
16
- # Get outputs
17
- target_sizes = torch.tensor([image.size[::-1]]) # PIL: (W, H) -> expected (H, W)
18
- results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]
19
 
20
- # Draw boxes on the image
21
- draw = ImageDraw.Draw(image)
22
- for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
23
- box = [round(i, 2) for i in box.tolist()]
24
- draw.rectangle(box, outline="red", width=3)
25
- draw.text((box[0], box[1]), f"{model.config.id2label[label.item()]}: {round(score.item(), 3)}", fill="red")
26
 
27
- return image
 
 
 
28
 
29
- # Launch Gradio interface
 
 
30
  demo = gr.Interface(
31
- fn=detect_objects,
32
- inputs=gr.Image(source="camera", tool="editor", live=True),
33
- outputs=gr.Image(type="pil"),
34
- title="Real-Time Object Detection",
35
- description="Open webcam and detect objects using facebook/detr-resnet-50"
 
 
 
36
  )
37
 
38
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
3
+ from PIL import Image
4
  import torch
5
+ from TTS.api import TTS
6
+ import tempfile
7
 
8
+ # Load OCR model
9
+ processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")
10
+ model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
11
 
12
+ # Load multilingual TTS model (intended for Bangla; NOTE: YourTTS is documented for en, fr-fr and pt-br only — verify Bangla support)
13
+ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
 
 
 
14
 
15
+ def bangla_reader(image):
16
+ if image is None:
17
+ return "কোনো ছবি পাওয়া যায়নি।", None
18
 
19
+ # OCR
20
+ pixel_values = processor(images=image, return_tensors="pt").pixel_values
21
+ generated_ids = model.generate(pixel_values)
22
+ ocr_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
 
23
 
24
+ # TTS
25
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
26
+ tts.tts_to_file(text=ocr_text, file_path=tmp.name, language="bn", speaker="female")
27
+ audio_path = tmp.name
28
 
29
+ return f"OCR ফলাফল: {ocr_text}", audio_path
30
+
31
+ # Gradio UI
32
  demo = gr.Interface(
33
+ fn=bangla_reader,
34
+ inputs=gr.Image(type="pil", label="বাংলা লেখা সম্বলিত ছবি দিন"),
35
+ outputs=[
36
+ gr.Textbox(label="OCR ফলাফল"),
37
+ gr.Audio(label="বাংলা কণ্ঠে পাঠ করুন")
38
+ ],
39
+ title="📖 বাংলা রিডার",
40
+ description="ছবির বাংলা লেখা পড়ে তা কণ্ঠে রূপান্তর করে শোনায়।"
41
  )
42
 
43
  if __name__ == "__main__":