import gradio as gr
import torch
from PIL import Image
import os
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from IndicTransToolkit import IndicProcessor
from gtts import gTTS
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import PyPDF2
import tempfile
from huggingface_hub import login
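
# This Space wires three pipelines together:
#   1. Image tab: BLIP caption -> IndicTrans2 translation -> gTTS audio.
#   2. Document tab: PDF/TXT -> chunks -> FAISS vector index (the RAG source).
#   3. Chat tab: Mixtral-8x7B-Instruct answers questions, grounded in the
#      retrieved document chunks when an index exists.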
# Authenticate with Hugging Face token
if os.getenv("HF_TOKEN"):
    login(token=os.getenv("HF_TOKEN"))
else:
    raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Spaces settings.")
# Initialize BLIP for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Initialize Mixtral-8x7B-Instruct for conversational tasks
mixtral_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
mixtral_tokenizer = AutoTokenizer.from_pretrained(mixtral_model_name)
mixtral_model = AutoModelForCausalLM.from_pretrained(mixtral_model_name)
mixtral_model = torch.quantization.quantize_dynamic(mixtral_model, {torch.nn.Linear}, dtype=torch.qint8)
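# Caveat: dynamic quantization converts nn.Linear weights to int8 for CPU
# inference, but from_pretrained() above still materializes the full fp32
# checkpoint first. Mixtral-8x7B has ~47B parameters (roughly 180+ GB in
# fp32), far beyond the RAM of a standard Space -- the most likely cause of
# the "Runtime error" banner. A 4-bit load via bitsandbytes
# (load_in_4bit=True) or a much smaller instruct model would fit better.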
# Initialize vector store and embeddings for RAG
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = None
temp_dir = tempfile.mkdtemp()  # holds generated audio files and the FAISS index
def generate_caption(image):
    image = image.convert("RGB")
    # Conditional captioning: the text argument is a prefix the caption continues from.
    inputs = blip_processor(image, "image of", return_tensors="pt")
    with torch.no_grad():
        generated_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
    return caption
def translate_caption(caption, target_languages):
    # Note: the IndicTrans2 model and tokenizer are reloaded on every call;
    # hoisting them to module level (like BLIP above) would cut per-request latency.
    model_name = "ai4bharat/indictrans2-en-indic-1B"
    tokenizer_IT2 = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model_IT2 = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
    model_IT2 = torch.quantization.quantize_dynamic(model_IT2, {torch.nn.Linear}, dtype=torch.qint8)
    ip = IndicProcessor(inference=True)
    src_lang = "eng_Latn"
    input_sentences = [caption]
    translations = {}
    for tgt_lang in target_languages:
        batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt")
        with torch.no_grad():
            generated_tokens = model_IT2.generate(**inputs, use_cache=True, min_length=0, max_length=256, num_beams=5, num_return_sequences=1)
        with tokenizer_IT2.as_target_tokenizer():
            generated_tokens = tokenizer_IT2.batch_decode(generated_tokens.detach().cpu().tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
        translated_texts = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
        translations[tgt_lang] = translated_texts[0]
    return translations
def generate_audio_gtts(text, lang_code):
    output_file = os.path.join(temp_dir, f"{lang_code}_gTTS.mp3")
    tts = gTTS(text=text, lang=lang_code)
    tts.save(output_file)
    return output_file
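
# gTTS synthesizes speech via Google Translate's TTS endpoint, so the Space
# needs outbound network access; lang_code is an ISO 639-1 code ("hi", "gu",
# "ur" for the languages offered below).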
def process_document(file):
    global vector_store
    # gr.File may hand us a tempfile wrapper or a plain path depending on the
    # Gradio version, so resolve to a filesystem path before reading.
    path = file.name if hasattr(file, "name") else file
    if path.endswith(".pdf"):
        text = ""
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                text += page.extract_text() or ""
    else:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    documents = [Document(page_content=chunk) for chunk in chunks]
    vector_store = FAISS.from_documents(documents, embeddings)
    vector_store.save_local(os.path.join(temp_dir, "faiss_index"))
    return "Document processed and indexed successfully."
def chat_with_llm(message, history):
    global vector_store
    context = ""
    if vector_store:
        docs = vector_store.similarity_search(message, k=2)
        context = "\n".join([doc.page_content for doc in docs])
    prompt = f"[INST] You are a helpful assistant. Use the following context to answer the question accurately:\n\n{context}\n\nQuestion: {message} [/INST]"
    inputs = mixtral_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        # max_new_tokens bounds the reply regardless of prompt length, and
        # do_sample=True is required for temperature to take effect.
        outputs = mixtral_model.generate(**inputs, max_new_tokens=500, num_return_sequences=1, do_sample=True, temperature=0.7)
    # Slice off the prompt tokens instead of string-replacing, which is brittle.
    response = mixtral_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
    # gr.Chatbot expects the full (user, bot) message history, not a bare string.
    history = history or []
    history.append((message, response))
    return history
def image_tab(image, target_languages):
    if image is None:
        return "Please upload an image.", {}, []
    caption = generate_caption(image)
    translations = translate_caption(caption, target_languages) if target_languages else {}
    audio_files = []
    for lang in target_languages:
        # Map IndicTrans2 language tags to the ISO 639-1 codes gTTS expects.
        lang_code = {"hin_Deva": "hi", "guj_Gujr": "gu", "urd_Arab": "ur"}.get(lang, "en")
        audio_file = generate_audio_gtts(translations[lang], lang_code)
        audio_files.append(audio_file)
    return caption, translations, audio_files
with gr.Blocks() as demo:
    gr.Markdown("# Multilingual Assistive Model")
    with gr.Tabs():
        with gr.TabItem("Image Processing"):
            image_input = gr.Image(type="pil", label="Upload Image")
            lang_select = gr.CheckboxGroup(["hin_Deva", "guj_Gujr", "urd_Arab"], label="Select Target Languages", value=["hin_Deva"])
            process_btn = gr.Button("Process Image")
            caption_output = gr.Textbox(label="Generated Caption")
            translation_output = gr.JSON(label="Translations")
            audio_output = gr.Files(label="Audio Files")
            process_btn.click(image_tab, inputs=[image_input, lang_select], outputs=[caption_output, translation_output, audio_output])
        with gr.TabItem("Document Upload"):
            doc_input = gr.File(label="Upload Document (PDF or TXT)", file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Process Document")
            doc_status = gr.Textbox(label="Status")
            upload_btn.click(process_document, inputs=doc_input, outputs=doc_status)
        with gr.TabItem("Chat with LLM"):
            chatbot = gr.Chatbot()
            msg = gr.Textbox(label="Your Message")
            clear = gr.Button("Clear")
            msg.submit(chat_with_llm, inputs=[msg, chatbot], outputs=chatbot)
            clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()
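
# To run this as a Space: save as app.py, add HF_TOKEN as a repository secret,
# and list the dependencies in requirements.txt (inferred from the imports):
# gradio, torch, transformers, IndicTransToolkit, gTTS, langchain,
# langchain-community, faiss-cpu, sentence-transformers, PyPDF2, huggingface_hub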