Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pytesseract | |
| from PIL import Image | |
| from transformers import MarianMTModel, MarianTokenizer | |
| from nltk.tokenize import sent_tokenize | |
| import nltk | |
| nltk.download('punkt') | |
# OCR function
def ocr_image(image, language):
    """Extract the text of an image with Tesseract.

    Args:
        image: Image handed over by the Gradio ``Image`` input, or ``None``
            when nothing was uploaded.
        language: Tesseract language codes selected in the UI, e.g.
            ``["eng", "fra"]``. A single code given as a plain string is
            accepted as well.

    Returns:
        The recognized text (stripped) below a short heading, or a prompt
        message when the image or the language selection is missing.
    """
    if image is None:
        return "Please upload an image."
    # Every checkbox can be unticked, which yields an empty selection; joining
    # it would pass an empty ``lang`` string on to pytesseract.
    if not language:
        return "Please select at least one OCR language."
    # Joining a bare string would split it per character ("e+n+g").
    if isinstance(language, str):
        language = [language]
    # Multiple languages are combined into one "+"-separated lang argument.
    lang = "+".join(language)
    text = pytesseract.image_to_string(image, lang=lang)
    return f"OCR Text of the image:\n\n{text.strip()}"
# Translation function
# Loaded (tokenizer, model) pairs keyed by model name, so each language pair
# is loaded once per process instead of on every request.
_TRANSLATION_MODELS = {}


def _load_translation_model(model_name):
    """Return the cached (tokenizer, model) pair for *model_name*."""
    if model_name not in _TRANSLATION_MODELS:
        _TRANSLATION_MODELS[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _TRANSLATION_MODELS[model_name]


def translate_text(text, direction):
    """Translate *text* sentence by sentence with a Helsinki-NLP Marian model.

    Args:
        text: Text to translate; ``None``, empty, or whitespace-only input
            is answered with a short notice instead of a translation.
        direction: Language pair written as ``"<src>-<tgt>"``, e.g.
            ``"en-zh"``. It selects the model
            ``Helsinki-NLP/opus-mt-<src>-<tgt>``.

    Returns:
        The translated sentences joined by newlines, or
        ``"No text to translate."`` when there is nothing to translate.

    Raises:
        ValueError: If *direction* does not consist of exactly two parts
            separated by ``-``.
    """
    if not text or not text.strip():
        return "No text to translate."
    src, tgt = direction.split("-")
    tokenizer, model = _load_translation_model(f"Helsinki-NLP/opus-mt-{src}-{tgt}")
    # NOTE(review): sent_tokenize is called without a language argument;
    # confirm it splits non-English (e.g. Chinese) input as intended.
    sentences = sent_tokenize(text)
    # All sentences go through the model as one padded batch; truncation=True
    # asks the tokenizer to cut over-long sentences instead of failing.
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    translated = [tokenizer.decode(t, skip_special_tokens=True) for t in outputs]
    return "\n".join(translated).strip()
# Gradio Interface
# OCR tab: the uploaded image and the chosen Tesseract language codes are
# passed to ocr_image, whose returned string is shown as plain text.
iface = gr.Interface(
    title="OCR Text Extractor",
    fn=ocr_image,
    inputs=[
        gr.Image(label="Image", type="pil"),
        gr.CheckboxGroup(
            label="OCR Language(s)",
            choices=["eng", "chi_sim", "fra", "deu"],
            value=["eng"],
        ),
    ],
    outputs="text",
)
# Add translation separately
# Translation tab: free text plus a "<src>-<tgt>" direction are passed to
# translate_text, whose returned string is shown as plain text.
translate_iface = gr.Interface(
    title="Text Translator",
    fn=translate_text,
    inputs=[
        gr.Textbox(label="Text to Translate"),
        gr.Radio(
            label="Translation Direction",
            choices=["en-zh", "zh-en", "en-fr", "fr-en"],
            value="en-zh",
        ),
    ],
    outputs="text",
)
# Combine both as a tabbed app
# The two interfaces become the "OCR" and "Translate" tabs of one app, which
# is launched as soon as this module runs.
demo = gr.TabbedInterface(
    [iface, translate_iface],
    ["OCR", "Translate"],
)
demo.launch()