kj03 committed on
Commit
1213370
·
verified ·
1 Parent(s): 90d94e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -19
app.py CHANGED
@@ -1,37 +1,41 @@
1
  import gradio as gr
2
- from transformers import TrOCRProcessor, VisionEncoderDecoderModel
3
  from PIL import Image
4
  import torch
5
- from TTS.api import TTS
 
 
6
  import tempfile
7
 
8
- # Load OCR model
9
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")
10
- model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
11
 
12
- # Load multilingual TTS model (supports Bangla)
13
- tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
 
 
 
14
 
15
  def bangla_reader(image):
16
  if image is None:
17
- return "কোনো ছবি পাওয়া যায়নি।", None
18
 
19
- # OCR
20
  pixel_values = processor(images=image, return_tensors="pt").pixel_values
21
- generated_ids = model.generate(pixel_values)
22
  ocr_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
23
 
24
- # Choose speaker safely
25
- available_speakers = tts.speakers
26
- speaker_id = available_speakers[0] if available_speakers else None
27
 
28
- # TTS
29
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
30
- tts.tts_to_file(text=ocr_text, file_path=tmp.name, language="bn", speaker=speaker_id)
31
  audio_path = tmp.name
32
 
33
- return f"OCR ফলাফল: {ocr_text}", audio_path
34
-
35
  # Gradio UI
36
  demo = gr.Interface(
37
  fn=bangla_reader,
@@ -40,8 +44,8 @@ demo = gr.Interface(
40
  gr.Textbox(label="OCR ফলাফল"),
41
  gr.Audio(label="বাংলা কণ্ঠে পাঠ করুন")
42
  ],
43
- title="📖 বাংলা রিডার",
44
- description="ছবির বাংলা লেখা পড়ে তা কণ্ঠে রূপান্তর করে শোনায়।"
45
  )
46
 
47
  if __name__ == "__main__":
 
1
  import gradio as gr
 
2
  from PIL import Image
3
  import torch
4
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
5
+ from espnet2.bin.tts_inference import Text2Speech
6
+ import soundfile as sf
7
  import tempfile
8
 
9
+ # Load OCR model (TrOCR base)
10
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")
11
+ ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")
12
 
13
+ # Load Bangla TTS model from ESPnet
14
+ tts_model = Text2Speech.from_pretrained(
15
+ model_tag="kan-bayashi/bengali_female",
16
+ device="cpu"
17
+ )
18
 
19
  def bangla_reader(image):
20
  if image is None:
21
+ return "কোনো ছবি দেওয়া হয়নি।", None
22
 
23
+ # Step 1: OCR
24
  pixel_values = processor(images=image, return_tensors="pt").pixel_values
25
+ generated_ids = ocr_model.generate(pixel_values)
26
  ocr_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
27
 
28
+ # Step 2: Bangla TTS
29
+ with torch.no_grad():
30
+ wav_output = tts_model(ocr_text)["wav"]
31
 
32
+ # Save to temporary file
33
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
34
+ sf.write(tmp.name, wav_output.view(-1).cpu().numpy(), 22050)
35
  audio_path = tmp.name
36
 
37
+ return f"OCR ফলাফল:\n{ocr_text}", audio_path
38
+
39
  # Gradio UI
40
  demo = gr.Interface(
41
  fn=bangla_reader,
 
44
  gr.Textbox(label="OCR ফলাফল"),
45
  gr.Audio(label="বাংলা কণ্ঠে পাঠ করুন")
46
  ],
47
+ title="📖 বাংলা রিডার (Bangla Reader)",
48
+ description="ছবির বাংলা লেখা পড়ে তা পাঠ্য ও কণ্ঠে রূপান্তর করে শোনায়।"
49
  )
50
 
51
  if __name__ == "__main__":