Spaces:

LocaleNLP
/

eng_wol

Sleeping

App Files Files Community

eng_wol / app.py

Mgolo

Update app.py

fa7ab8f verified 5 months ago

raw

history blame contribute delete

6.6 kB

	import gradio as gr
	from transformers import pipeline, MarianTokenizer, AutoModelForSeq2SeqLM
	import torch
	import unicodedata
	import re
	import whisper
	import tempfile
	import os

	import nltk
	nltk.download('punkt')
	from nltk.tokenize import sent_tokenize

	import fitz # PyMuPDF
	import docx
	from bs4 import BeautifulSoup
	import markdown2
	import chardet

	# Run on the GPU when torch can see one; otherwise fall back to the CPU.
	device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

	# Module-level caches for the heavy models. Both start empty and are
	# populated by load_darija_model() / load_whisper_model() on first use.
	translator = None
	whisper_model = None

	# Hugging Face access token taken from the environment (None when unset).
	HF_TOKEN = os.environ.get("HF_TOKEN")
	def load_darija_model():
	    """Return the shared translation pipeline, building it on the first call.

	    NOTE: despite the "darija" in the function name, the checkpoint loaded
	    here is ``LocaleNLP/eng_wolof``. The pipeline is stored in the
	    module-level ``translator`` so subsequent calls reuse it.
	    """
	    global translator
	    if translator is not None:
	        return translator

	    model_name = "LocaleNLP/eng_wolof"
	    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=HF_TOKEN)
	    model = model.to(device)
	    tokenizer = MarianTokenizer.from_pretrained(model_name, token=HF_TOKEN)
	    # The pipeline is given device index 0 on CUDA and -1 otherwise.
	    pipeline_device = 0 if device.type == 'cuda' else -1
	    translator = pipeline(
	        "translation",
	        model=model,
	        tokenizer=tokenizer,
	        device=pipeline_device,
	    )
	    return translator

	def load_whisper_model():
	    """Return the shared Whisper "base" model, loading it on the first call.

	    The model is kept in the module-level ``whisper_model`` for reuse.
	    """
	    global whisper_model
	    if whisper_model is not None:
	        return whisper_model
	    whisper_model = whisper.load_model("base")
	    return whisper_model

	def transcribe_audio(audio_file):
	    """Transcribe an audio clip to text with the shared Whisper model.

	    Args:
	        audio_file: Either a filesystem path (``str``) or a binary
	            file-like object exposing ``read()``.

	    Returns:
	        The ``"text"`` entry of the transcription result.
	    """
	    model = load_whisper_model()
	    if isinstance(audio_file, str):
	        return model.transcribe(audio_file)["text"]

	    # The model is handed a path, so a file-like object is first spooled to
	    # a temporary .wav file. delete=False lets the file outlive the handle;
	    # the finally clause guarantees it is removed even if reading, writing
	    # or transcription fails.
	    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
	    try:
	        with tmp:
	            tmp.write(audio_file.read())
	        return model.transcribe(tmp.name)["text"]
	    finally:
	        os.remove(tmp.name)

	def extract_text_from_file(uploaded_file):
	    """Extract plain text from an uploaded document.

	    Supported extensions: ``pdf``, ``docx``, ``html``/``htm``, ``md``,
	    ``srt`` and ``txt``/``text``.

	    Args:
	        uploaded_file: Either a filesystem path (``str``) or a binary
	            file-like object exposing ``name`` and ``read()``.

	    Returns:
	        The text content of the document as a ``str``.

	    Raises:
	        ValueError: If the file extension is not one of the supported types.
	    """
	    import io  # local import: only needed to wrap raw bytes for the docx parser

	    supported_types = ("pdf", "docx", "html", "htm", "md", "srt", "txt", "text")

	    is_path = isinstance(uploaded_file, str)
	    file_name = uploaded_file if is_path else uploaded_file.name
	    file_type = os.path.splitext(file_name)[1].lstrip('.').lower()

	    # Reject unknown types before doing any I/O or encoding detection.
	    if file_type not in supported_types:
	        raise ValueError("Unsupported file type")

	    if is_path:
	        with open(uploaded_file, "rb") as f:
	            content = f.read()
	    else:
	        content = uploaded_file.read()

	    if file_type == "pdf":
	        with fitz.open(stream=content, filetype="pdf") as doc:
	            return "\n".join(page.get_text() for page in doc)

	    if file_type == "docx":
	        # Parse from the bytes already read rather than from the original
	        # object, whose stream position is at EOF after read().
	        document = docx.Document(io.BytesIO(content))
	        return "\n".join(para.text for para in document.paragraphs)

	    # Text-based formats: decode with the detected encoding, falling back
	    # to UTF-8 when detection yields nothing (e.g. an empty file) or names
	    # a codec Python does not know, so the branches below always see str.
	    detected = chardet.detect(content)['encoding']
	    try:
	        text = content.decode(detected or "utf-8", errors='ignore')
	    except LookupError:
	        text = content.decode("utf-8", errors='ignore')

	    if file_type in ("html", "htm"):
	        return BeautifulSoup(text, "html.parser").get_text()
	    if file_type == "md":
	        html = markdown2.markdown(text)
	        return BeautifulSoup(html, "html.parser").get_text()
	    if file_type == "srt":
	        # Drop "<index>\n<start> --> <end>\n" cue headers; \r? also accepts
	        # CRLF line endings after the cue index.
	        return re.sub(r"\d+\r?\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", text)
	    # Remaining supported types are plain text ("txt" / "text").
	    return text

	def translate(text):
	    """Translate text with the shared pipeline, one paragraph at a time.

	    Each line of ``text`` is treated as a paragraph; blank lines are kept as
	    empty lines in the output. Within a paragraph, sentences are split on
	    ``". "``, prefixed with the ``>>wol<<`` language tag, translated as one
	    batch and re-joined with ``". "``.

	    Args:
	        text: The source text to translate.

	    Returns:
	        The translated paragraphs joined with newlines.
	    """
	    translator = load_darija_model()
	    lang_tag = ">>wol<<"
	    generation_kwargs = dict(
	        max_length=5000,
	        num_beams=5,
	        early_stopping=True,
	        no_repeat_ngram_size=3,
	        repetition_penalty=1.5,
	        length_penalty=1.2,
	    )

	    translated_output = []
	    with torch.no_grad():
	        for para in text.split("\n"):
	            sentences = [s.strip() for s in para.split('. ') if s.strip()]
	            if not sentences:
	                # Blank paragraph (or separators only): keep the line break
	                # without invoking the model.
	                translated_output.append("")
	                continue

	            formatted = [f"{lang_tag} {s}" for s in sentences]
	            results = translator(formatted, **generation_kwargs)

	            # Upper-case only the first character. str.capitalize() would
	            # also lower-case the rest of the sentence, mangling proper
	            # nouns and acronyms in the translation.
	            translated_sentences = [
	                t[:1].upper() + t[1:]
	                for t in (r['translation_text'] for r in results)
	            ]
	            translated_output.append('. '.join(translated_sentences))

	    return "\n".join(translated_output)

	def process_input(input_mode, text, audio_file, file_obj):
	    """Produce the source text for the selected input mode.

	    Args:
	        input_mode: One of ``"Text"``, ``"Audio"`` or ``"File"``.
	        text: Text typed by the user (returned as-is in ``"Text"`` mode).
	        audio_file: Audio input, transcribed in ``"Audio"`` mode when not None.
	        file_obj: Document input, extracted in ``"File"`` mode when not None.

	    Returns:
	        The text for the chosen mode, or ``""`` when the mode is unknown or
	        its input is missing.
	    """
	    if input_mode == "Text":
	        return text
	    if input_mode == "Audio" and audio_file is not None:
	        return transcribe_audio(audio_file)
	    if input_mode == "File" and file_obj is not None:
	        return extract_text_from_file(file_obj)
	    return ""

	def translate_and_return(text):
	    """Translate ``text``, or return a notice when there is nothing to translate.

	    Args:
	        text: The text to translate; may be ``None`` or whitespace-only.

	    Returns:
	        The translation, or ``"No input text to translate."`` when ``text``
	        is ``None``, empty or only whitespace.
	    """
	    # Guard None explicitly: calling .strip() on it would raise AttributeError.
	    if text is None or not text.strip():
	        return "No input text to translate."
	    return translate(text)

	# Gradio UI components
	with gr.Blocks() as demo:
	    gr.Markdown("## LocaleNLP English-to-Wolof Translator")
	    gr.Markdown("Upload English text, audio, or document to translate to Wolof using Localenlp model.")

	    with gr.Row():
	        input_mode = gr.Radio(choices=["Text", "Audio", "File"], label="Select input mode", value="Text")

	    # One input widget per mode; only the one matching the selected mode is visible.
	    input_text = gr.Textbox(label="Enter English text", lines=10, visible=True)
	    audio_input = gr.Audio(label="Upload audio (.wav, .mp3, .m4a)", type="filepath", visible=False)
	    file_input = gr.File(file_types=['.pdf', '.docx', '.html', '.htm', '.md', '.srt', '.txt'], label="Upload document", visible=False)

	    extracted_text = gr.Textbox(label="Extracted / Transcribed Text", lines=10, interactive=False)
	    translate_button = gr.Button("Translate to Wolof")
	    output_text = gr.Textbox(label="Translated Wolof Text", lines=10, interactive=False)

	    def update_visibility(mode):
	        """Show the input widget for ``mode`` and clear both result boxes."""
	        return {
	            input_text: gr.update(visible=(mode == "Text")),
	            audio_input: gr.update(visible=(mode == "Audio")),
	            file_input: gr.update(visible=(mode == "File")),
	            extracted_text: gr.update(value="", visible=True),
	            output_text: gr.update(value=""),
	        }

	    input_mode.change(fn=update_visibility, inputs=input_mode, outputs=[input_text, audio_input, file_input, extracted_text, output_text])

	    def handle_process(mode, text, audio, file_obj):
	        """Return ``(extracted_text, error_message)`` for the selected input."""
	        try:
	            extracted = process_input(mode, text, audio, file_obj)
	            return extracted, ""
	        except Exception as e:
	            return "", f"Error: {str(e)}"

	    def handle_translate(text):
	        """Translate already-extracted text."""
	        return translate_and_return(text)

	    def handle_click(mode, text, audio, file_obj):
	        """Extract the source text, then translate it, in a single step.

	        Returns ``(extracted_text, output_text)``. Extraction and translation
	        run in one handler so translation always works on the freshly
	        extracted text: separate click listeners on the same button would
	        each receive the component values captured at click time and would
	        both write to the output box.
	        """
	        extracted, error = handle_process(mode, text, audio, file_obj)
	        if error:
	            return extracted, error
	        try:
	            return extracted, handle_translate(extracted)
	        except Exception as e:
	            # Report translation failures the same way as extraction failures.
	            return extracted, f"Error: {str(e)}"

	    translate_button.click(fn=handle_click, inputs=[input_mode, input_text, audio_input, file_input], outputs=[extracted_text, output_text])

	demo.launch()