|
|
import gradio as gr |
|
|
from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM |
|
|
import torch |
|
|
import unicodedata |
|
|
import re |
|
|
import whisper |
|
|
import tempfile |
|
|
import os |
|
|
|
|
|
import nltk |
|
|
nltk.download('punkt') |
|
|
from nltk.tokenize import sent_tokenize |
|
|
|
|
|
import fitz |
|
|
import docx |
|
|
from bs4 import BeautifulSoup |
|
|
import markdown2 |
|
|
import chardet |
|
|
|
|
|
|
|
|
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Lazily created singletons; they stay None until a loader populates them.
translator = None
whisper_model = None

# Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
def load_darija_model():
    """Return the shared translation pipeline, building it on first call.

    The pipeline wraps the "LocaleNLP/eng_wolof" checkpoint and is cached in
    the module-level ``translator`` so later calls reuse the same object.
    NOTE(review): despite the function name, the checkpoint loaded here is
    the English-to-Wolof model.
    """
    global translator

    # Fast path: the pipeline was already built by an earlier call.
    if translator is not None:
        return translator

    checkpoint = "LocaleNLP/eng_wolof"
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, token=HF_TOKEN).to(device)
    marian_tokenizer = MarianTokenizer.from_pretrained(checkpoint, token=HF_TOKEN)

    # The pipeline takes a device index: 0 for CUDA, -1 for CPU.
    pipeline_device = 0 if device.type == 'cuda' else -1
    translator = pipeline(
        "translation",
        model=seq2seq,
        tokenizer=marian_tokenizer,
        device=pipeline_device,
    )
    return translator
|
|
|
|
|
def load_whisper_model():
    """Return the shared Whisper "base" model, loading it on first call.

    The loaded model is cached in the module-level ``whisper_model``.
    """
    global whisper_model

    # Already loaded by a previous call: reuse it.
    if whisper_model is not None:
        return whisper_model

    whisper_model = whisper.load_model("base")
    return whisper_model
|
|
|
|
|
def transcribe_audio(audio_file):
    """Transcribe an audio input to text with the shared Whisper model.

    Args:
        audio_file: Either a filesystem path (``str``) or an object with a
            ``read()`` method returning the audio bytes.

    Returns:
        The ``"text"`` entry of the Whisper transcription result.
    """
    model = load_whisper_model()

    # A path can be handed to Whisper directly; nothing to clean up.
    if isinstance(audio_file, str):
        return model.transcribe(audio_file)["text"]

    # File-like input is spooled to a temporary .wav file first.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    audio_path = tmp.name
    try:
        # Close the handle before transcribing so the file is fully flushed.
        with tmp:
            tmp.write(audio_file.read())
        return model.transcribe(audio_path)["text"]
    finally:
        # Always remove the temp file, even when reading or transcription
        # fails; the original leaked it on any exception.
        os.remove(audio_path)
|
|
|
|
|
def extract_text_from_file(uploaded_file):
    """Extract plain text from an uploaded document.

    Supported extensions: pdf, docx, html/htm, md, srt, txt/text. The
    extension is taken from the text after the last ``.`` in the file name,
    compared case-insensitively.

    Args:
        uploaded_file: Either a filesystem path (``str``) or an object with
            a ``name`` attribute and a ``read()`` method.

    Returns:
        The extracted text as ``str``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    import io  # local import: only needed to wrap in-memory bytes for docx

    if isinstance(uploaded_file, str):
        file_name = uploaded_file
        with open(uploaded_file, "rb") as f:
            content = f.read()
    else:
        file_name = uploaded_file.name
        content = uploaded_file.read()
    file_type = file_name.split('.')[-1].lower()

    if file_type == "pdf":
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join([page.get_text() for page in doc])

    if file_type == "docx":
        # Parse from the bytes already read. Handing the original stream to
        # python-docx would fail because read() above consumed it.
        doc = docx.Document(io.BytesIO(content))
        return "\n".join([para.text for para in doc.paragraphs])

    # Reject unknown types before spending time on encoding detection.
    if file_type not in ("html", "htm", "md", "srt", "txt", "text"):
        raise ValueError("Unsupported file type")

    if isinstance(content, bytes):
        # chardet may report no encoding (e.g. empty input); fall back to
        # UTF-8 so the text branches below always work on str.
        encoding = chardet.detect(content)['encoding'] or 'utf-8'
        try:
            content = content.decode(encoding, errors='ignore')
        except LookupError:
            # Detected name is not a codec Python knows about.
            content = content.decode('utf-8', errors='ignore')

    if file_type in ("html", "htm"):
        return BeautifulSoup(content, "html.parser").get_text()

    if file_type == "md":
        # Render Markdown to HTML, then strip the tags.
        html = markdown2.markdown(content)
        return BeautifulSoup(html, "html.parser").get_text()

    if file_type == "srt":
        # Drop the cue index and timestamp lines; accept LF and CRLF files.
        return re.sub(
            r"\d+\r?\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\r?\n", "", content
        )

    # txt / text
    return content
|
|
|
|
|
def translate(text): |
|
|
translator = load_darija_model() |
|
|
lang_tag = ">>wol<<" |
|
|
|
|
|
paragraphs = text.split("\n") |
|
|
translated_output = [] |
|
|
|
|
|
with torch.no_grad(): |
|
|
for para in paragraphs: |
|
|
if not para.strip(): |
|
|
translated_output.append("") |
|
|
continue |
|
|
sentences = [s.strip() for s in para.split('. ') if s.strip()] |
|
|
formatted = [f"{lang_tag} {s}" for s in sentences] |
|
|
|
|
|
results = translator(formatted, |
|
|
max_length=5000, |
|
|
num_beams=5, |
|
|
early_stopping=True, |
|
|
no_repeat_ngram_size=3, |
|
|
repetition_penalty=1.5, |
|
|
length_penalty=1.2) |
|
|
translated_sentences = [r['translation_text'].capitalize() for r in results] |
|
|
translated_output.append('. '.join(translated_sentences)) |
|
|
|
|
|
return "\n".join(translated_output) |
|
|
|
|
|
def process_input(input_mode, text, audio_file, file_obj):
    """Resolve the text to work on for the selected input mode.

    Args:
        input_mode: One of "Text", "Audio" or "File".
        text: Text used as-is in "Text" mode.
        audio_file: Audio input transcribed in "Audio" mode (may be None).
        file_obj: Document whose text is extracted in "File" mode (may be None).

    Returns:
        The resolved text, or an empty string when the mode is unknown or
        the input for the selected mode is missing.
    """
    if input_mode == "Text":
        return text
    if input_mode == "Audio" and audio_file is not None:
        return transcribe_audio(audio_file)
    if input_mode == "File" and file_obj is not None:
        return extract_text_from_file(file_obj)
    # Unknown mode, or nothing supplied for the chosen mode.
    return ""
|
|
|
|
|
def translate_and_return(text):
    """Translate ``text``, or return a notice when there is nothing to translate.

    Args:
        text: The source text; ``None``, empty and whitespace-only values
            are all treated as "no input".

    Returns:
        The translation, or the string "No input text to translate.".
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        return "No input text to translate."
    return translate(text)
|
|
|
|
|
|
|
|
# Gradio UI: pick an input mode, extract/transcribe the text, translate it.
with gr.Blocks() as demo:
    gr.Markdown("## LocaleNLP English-to-Wolof Translator")
    gr.Markdown("Upload English text, audio, or document to translate to Wolof using Localenlp model.")

    with gr.Row():
        input_mode = gr.Radio(choices=["Text", "Audio", "File"], label="Select input mode", value="Text")

    # Only the widget matching the selected mode is visible at a time.
    input_text = gr.Textbox(label="Enter English text", lines=10, visible=True)
    audio_input = gr.Audio(label="Upload audio (.wav, .mp3, .m4a)", type="filepath", visible=False)
    file_input = gr.File(file_types=['.pdf', '.docx', '.html', '.htm', '.md', '.srt', '.txt'], label="Upload document", visible=False)

    extracted_text = gr.Textbox(label="Extracted / Transcribed Text", lines=10, interactive=False)
    translate_button = gr.Button("Translate to Wolof")
    output_text = gr.Textbox(label="Translated Wolof Text", lines=10, interactive=False)

    def update_visibility(mode):
        """Show the input widget for ``mode`` and clear both result boxes."""
        return {
            input_text: gr.update(visible=(mode == "Text")),
            audio_input: gr.update(visible=(mode == "Audio")),
            file_input: gr.update(visible=(mode == "File")),
            extracted_text: gr.update(value="", visible=True),
            output_text: gr.update(value=""),
        }

    input_mode.change(fn=update_visibility, inputs=input_mode, outputs=[input_text, audio_input, file_input, extracted_text, output_text])

    def handle_process(mode, text, audio, file_obj):
        """Return ``(extracted_text, error_message)`` for the chosen input."""
        try:
            extracted = process_input(mode, text, audio, file_obj)
            return extracted, ""
        except Exception as e:
            return "", f"Error: {str(e)}"

    def handle_translate(text):
        """Translate already-extracted text."""
        return translate_and_return(text)

    def handle_click(mode, text, audio, file_obj):
        """Extract the input text, then translate it, in a single step.

        Returns ``(extracted_text, translation_or_error)``.
        """
        extracted, error = handle_process(mode, text, audio, file_obj)
        if error:
            # Keep the extraction error visible instead of overwriting it.
            return extracted, error
        try:
            return extracted, handle_translate(extracted)
        except Exception as e:
            return extracted, f"Error: {str(e)}"

    # One listener that runs both steps in order. Two separate click
    # listeners on the same button each receive the component values as they
    # were at click time, so the translation step would read the previous
    # (stale or empty) contents of `extracted_text`.
    translate_button.click(fn=handle_click, inputs=[input_mode, input_text, audio_input, file_input], outputs=[extracted_text, output_text])

demo.launch()
|
|
|