BanglaScreenReader

Sleeping

App Files Files Community

kj03 committed on Jun 20, 2025

Commit

c31f5a5

verified ·

1 Parent(s): 001ba9b

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -106

app.py CHANGED Viewed

@@ -3,24 +3,16 @@ from PIL import Image
 from gtts import gTTS
 import pytesseract
 import tempfile
-import torch
-from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
-import re
-import torchaudio
-# Set Tesseract path explicitly (for OCR)
 pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
-# --- OCR Module (Existing) ---
 def bangla_reader(image):
     if image is None:
         return "কোনো ছবি দেওয়া হয়নি।", None
     # OCR with Bengali support
-    try:
-        ocr_text = pytesseract.image_to_string(image, lang='ben')
-    except Exception as e:
-        return f"OCR ত্রুটি: {str(e)}", None
     if not ocr_text.strip():
         return "কোনো লেখা সনাক্ত করা যায়নি।", None
@@ -36,103 +28,18 @@ def bangla_reader(image):
     return f"OCR ফলাফল:\n{ocr_text.strip()}", audio_path
-# --- Voice-to-Form Module (New) ---
-# Load models only once
-if not torch.cuda.is_available():
-    device = "cpu"
-else:
-    device = 0 if torch.cuda.is_available() else "cpu"
-# Initialize speech recognition pipeline
-asr_pipe = pipeline(
-    "automatic-speech-recognition",
-    model="facebook/wav2vec2-large-xlsr-53-bengali",
-    device=device
 )
-# Initialize form extraction model (smaller model for Hugging Face Spaces)
-form_model_name = "csebuetnlp/banglat5_nmt_en_bn"  # Alternative: "google/flan-t5-small"
-form_tokenizer = AutoTokenizer.from_pretrained(form_model_name)
-form_model = AutoModelForSeq2SeqLM.from_pretrained(form_model_name)
-def extract_form_info(text):
-    """Extract name, age, and address from Bangla text using NLU"""
-    prompt = (
-        "নিচের বাক্য থেকে নাম, বয়স এবং ঠিকানা বের করুন। যদি কোনো তথ্য না থাকে, 'N/A' লিখুন:\n"
-        f"{text}\n\n"
-        "নাম: \nবয়স: \nঠিকানা:"
-    )
-    inputs = form_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
-    outputs = form_model.generate(**inputs, max_length=150)
-    response = form_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Parse response
-    name = re.search(r'নাম:\s*(.*?)(\n|$)', response)
-    age = re.search(r'বয়স:\s*(.*?)(\n|$)', response)
-    address = re.search(r'ঠিকানা:\s*(.*?)(\n|$)', response)
-    return {
-        "name": name.group(1).strip() if name else "N/A",
-        "age": age.group(1).strip() if age else "N/A",
-        "address": address.group(1).strip() if address else "N/A"
-    }
-def voice_to_form(audio_path):
-    """Process audio input to fill form"""
-    if not audio_path:
-        return {
-            "name": "অডিও দেওয়া হয়নি",
-            "age": "",
-            "address": ""
-        }
-    try:
-        # Convert speech to text
-        text = asr_pipe(audio_path)["text"]
-        # Extract form information
-        form_data = extract_form_info(text)
-        return form_data
-    except Exception as e:
-        return {
-            "name": f"ত্রুটি: {str(e)}",
-            "age": "",
-            "address": ""
-        }
-# --- Gradio Interface ---
-with gr.Blocks(title="বাংলা সহায়ক") as demo:
-    gr.Markdown("# 📖 বাংলা সহায়ক (Bangla Assistant)")
-    with gr.Tab("ছবি থেকে পাঠ্য ও পাঠ (Image Reader)"):
-        gr.Markdown("## ছবির বাংলা লেখা পাঠ্য ও কণ্ঠে রূপান্তর করুন")
-        image_input = gr.Image(type="pil", label="বাংলা লেখা সম্বলিত ছবি দিন")
-        text_output = gr.Textbox(label="OCR ফলাফল")
-        audio_output = gr.Audio(label="বাংলা কণ্ঠে শুনুন")
-        image_btn = gr.Button("প্রক্রিয়া করুন")
-        image_btn.click(bangla_reader, inputs=image_input, outputs=[text_output, audio_output])
-    with gr.Tab("কণ্ঠ থেকে ফর্ম (Voice Form Filler)"):
-        gr.Markdown("## বাংলা কথার মাধ্যমে ফর্ম পূরণ করুন")
-        gr.Markdown("উদাহরণ: \"আমার নাম রুবেল ইসলাম, বয়স ৩৫ বছর, ঠিকানা খুলনা সদর\"")
-        audio_input = gr.Audio(source="microphone", type="filepath", label="বাংলায় কথা বলুন")
-        form_btn = gr.Button("ফর্ম পূরণ করুন")
-        with gr.Row():
-            with gr.Column():
-                name_out = gr.Textbox(label="নাম")
-            with gr.Column():
-                age_out = gr.Textbox(label="বয়স")
-        address_out = gr.Textbox(label="ঠিকানা")
-        form_btn.click(
-            voice_to_form,
-            inputs=audio_input,
-            outputs=[name_out, age_out, address_out]
-        )
 if __name__ == "__main__":
     demo.launch()

 from gtts import gTTS
 import pytesseract
 import tempfile
+# FIX: Set Tesseract path explicitly
 pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
 def bangla_reader(image):
     if image is None:
         return "কোনো ছবি দেওয়া হয়নি।", None
     # OCR with Bengali support
+    ocr_text = pytesseract.image_to_string(image, lang='ben')
     if not ocr_text.strip():
         return "কোনো লেখা সনাক্ত করা যায়নি।", None
     return f"OCR ফলাফল:\n{ocr_text.strip()}", audio_path
+# Gradio UI
+demo = gr.Interface(
+    fn=bangla_reader,
+    inputs=gr.Image(type="pil", label="বাংলা লেখা সম্বলিত ছবি দিন"),
+    outputs=[
+        gr.Textbox(label="OCR ফলাফল"),
+        gr.Audio(label="বাংলা কণ্ঠে শুনুন")
+    ],
+    title="📖 বাংলা রিডার (Bangla Reader)",
+    description="ছবির বাংলা লেখা পড়ে তা পাঠ্য ও কণ্ঠে রূপান্তর করে শোনায়।",
+    allow_flagging="never"
 )
 if __name__ == "__main__":
     demo.launch()