"""Gradio multi-model AI toolkit.

Tabs: OCR (TrOCR), speech-to-text (Whisper), EN<->ES translation (Marian),
sentiment analysis, and summarization (BART). All models are loaded once at
startup; each tab's button calls one of the handler functions below.
"""

import gradio as gr
import numpy as np
import torch
import torchaudio
from PIL import Image
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline,
)

print("Loading models...")

# OCR (printed text)
ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")

# Speech to text
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# Translation (EN <-> ES)
trans_en_es_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es")
trans_en_es_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-es")
trans_es_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-en")
trans_es_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-es-en")

# Sentiment (default pipeline model)
sentiment = pipeline("sentiment-analysis")

# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

print("Models loaded successfully")


def ocr_extract(image):
    """Extract printed text from a PIL image with TrOCR.

    Returns the decoded text, or an error string when no image is given.
    """
    if image is None:
        return "Error: No image"
    # TrOCR expects 3-channel input; normalize RGBA / grayscale / palette
    # uploads so the processor doesn't choke on them.
    if image.mode != "RGB":
        image = image.convert("RGB")
    pixel_values = ocr_processor(image, return_tensors="pt").pixel_values
    with torch.inference_mode():
        generated_ids = ocr_model.generate(pixel_values)
    return ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def speech_to_text(audio):
    """Transcribe an audio file (filepath from Gradio) with Whisper.

    Returns the transcription, or an error string when no audio is given.
    """
    if audio is None:
        return "Error: No audio"
    waveform, sr = torchaudio.load(audio)
    # BUG FIX: torchaudio returns (channels, frames); for stereo input the
    # original .squeeze() left a 2-D array, which Whisper cannot take as a
    # single utterance. Downmix to mono first.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Whisper is trained on 16 kHz audio; resample anything else.
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    inputs = whisper_processor(
        waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
    )
    with torch.inference_mode():
        generated_ids = whisper_model.generate(inputs.input_features)
    return whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def _translate(text, tokenizer, model):
    """Shared Marian translation path (deduplicates the two direction wrappers)."""
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    with torch.inference_mode():
        translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]


def translate_en_to_es(text):
    """Translate English text to Spanish; error string on empty input."""
    if not text:
        return "Error: No text"
    return _translate(text, trans_en_es_tokenizer, trans_en_es_model)


def translate_es_to_en(text):
    """Translate Spanish text to English; error string on empty input."""
    if not text:
        return "Error: No text"
    return _translate(text, trans_es_en_tokenizer, trans_es_en_model)


def analyze_sentiment(text):
    """Classify sentiment and format it as 'LABEL: score%'."""
    if not text:
        return "Error: No text"
    result = sentiment(text)[0]
    return f"{result['label']}: {result['score']:.2%}"


def summarize(text):
    """Summarize long text with BART; rejects input under 100 characters."""
    if not text or len(text) < 100:
        return "Text too short"
    result = summarizer(text, max_length=130, min_length=30, do_sample=False)
    return result[0]['summary_text']


with gr.Blocks(title="AI Toolkit") as app:
    gr.Markdown("# AI Multi-Model Toolkit")

    with gr.Tab("OCR"):
        with gr.Row():
            ocr_img = gr.Image(type="pil", label="Image")
            ocr_out = gr.Textbox(label="Text", lines=10)
        gr.Button("Extract").click(ocr_extract, ocr_img, ocr_out)

    with gr.Tab("Speech to Text"):
        with gr.Row():
            audio_in = gr.Audio(type="filepath", label="Audio")
            audio_out = gr.Textbox(label="Transcription", lines=10)
        gr.Button("Transcribe").click(speech_to_text, audio_in, audio_out)

    with gr.Tab("Translation"):
        with gr.Row():
            with gr.Column():
                trans_in = gr.Textbox(label="Input", lines=5)
                with gr.Row():
                    btn_en_es = gr.Button("EN to ES")
                    btn_es_en = gr.Button("ES to EN")
            trans_out = gr.Textbox(label="Output", lines=5)
        btn_en_es.click(translate_en_to_es, trans_in, trans_out)
        btn_es_en.click(translate_es_to_en, trans_in, trans_out)

    with gr.Tab("Sentiment"):
        with gr.Row():
            sent_in = gr.Textbox(label="Text", lines=5)
            sent_out = gr.Textbox(label="Result", lines=2)
        gr.Button("Analyze").click(analyze_sentiment, sent_in, sent_out)

    with gr.Tab("Summarize"):
        with gr.Row():
            summ_in = gr.Textbox(label="Long Text", lines=10)
            summ_out = gr.Textbox(label="Summary", lines=5)
        gr.Button("Summarize").click(summarize, summ_in, summ_out)


if __name__ == "__main__":
    # Guarded so importing this module (e.g. for testing) does not start the
    # server; running it as a script behaves as before.
    app.launch()