import gradio as gr
import torch
from PIL import Image
import os
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForCausalLM,
)
from IndicTransToolkit import IndicProcessor
from gtts import gTTS
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import PyPDF2
import tempfile
from huggingface_hub import login

# Authenticate with the Hugging Face token (required for gated models such as Mixtral)
if os.getenv("HF_TOKEN"):
    login(token=os.getenv("HF_TOKEN"))
else:
    raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Spaces settings.")

# Initialize BLIP for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Initialize Mixtral-8x7B-Instruct for conversational tasks, with dynamic int8 quantization
mixtral_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
mixtral_tokenizer = AutoTokenizer.from_pretrained(mixtral_model_name)
mixtral_model = AutoModelForCausalLM.from_pretrained(mixtral_model_name)
mixtral_model = torch.quantization.quantize_dynamic(mixtral_model, {torch.nn.Linear}, dtype=torch.qint8)

# Initialize embeddings for RAG; the FAISS vector store is built when a document is uploaded
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = None
temp_dir = tempfile.mkdtemp()


def generate_caption(image):
    """Generate an English caption for a PIL image with BLIP."""
    image = image.convert("RGB")
    inputs = blip_processor(image, "image of", return_tensors="pt")
    with torch.no_grad():
        generated_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
    return caption


def translate_caption(caption, target_languages):
    """Translate an English caption into the selected Indic languages with IndicTrans2."""
    # The model is loaded on every call; cache it globally if latency becomes an issue.
    model_name = "ai4bharat/indictrans2-en-indic-1B"
    tokenizer_IT2 = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model_IT2 = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
    model_IT2 = torch.quantization.quantize_dynamic(model_IT2, {torch.nn.Linear}, dtype=torch.qint8)

    ip = IndicProcessor(inference=True)
    src_lang = "eng_Latn"
    input_sentences = [caption]
    translations = {}

    for tgt_lang in target_languages:
        batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt")
        with torch.no_grad():
            generated_tokens = model_IT2.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )
        with tokenizer_IT2.as_target_tokenizer():
            generated_tokens = tokenizer_IT2.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translated_texts = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
        translations[tgt_lang] = translated_texts[0]

    return translations


def generate_audio_gtts(text, lang_code):
    """Synthesize speech for the given text with gTTS and return the MP3 path."""
    output_file = os.path.join(temp_dir, f"{lang_code}_gTTS.mp3")
    tts = gTTS(text=text, lang=lang_code)
    tts.save(output_file)
    return output_file


def process_document(file):
    """Extract text from an uploaded PDF/TXT file, chunk it, and build a FAISS index."""
    global vector_store
    # gr.File may pass a file path or a tempfile-like object depending on the Gradio version.
    file_path = file.name if hasattr(file, "name") else file
    if file_path.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""
    else:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    documents = [Document(page_content=chunk) for chunk in chunks]
    vector_store = FAISS.from_documents(documents, embeddings)
    vector_store.save_local(os.path.join(temp_dir, "faiss_index"))
    return "Document processed and indexed successfully."


def chat_with_llm(message, history):
    """Answer a chat message with Mixtral, using retrieved document chunks as context."""
    global vector_store
    context = ""
    if vector_store:
        docs = vector_store.similarity_search(message, k=2)
        context = "\n".join([doc.page_content for doc in docs])

    prompt = (
        "[INST] You are a helpful assistant. Use the following context to answer the question accurately:\n\n"
        f"{context}\n\nQuestion: {message} [/INST]"
    )
    inputs = mixtral_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = mixtral_model.generate(
            **inputs,
            max_new_tokens=500,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
        )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    response = mixtral_tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    ).strip()

    # gr.Chatbot expects the full (user, assistant) history, not a bare string.
    history = history or []
    history.append((message, response))
    return history


def image_tab(image, target_languages):
    """Caption an image, translate the caption, and synthesize audio for each language."""
    if image is None:
        return "Please upload an image.", {}, []
    caption = generate_caption(image)
    translations = translate_caption(caption, target_languages) if target_languages else {}
    audio_files = []
    for lang in target_languages:
        lang_code = {"hin_Deva": "hi", "guj_Gujr": "gu", "urd_Arab": "ur"}.get(lang, "en")
        audio_file = generate_audio_gtts(translations[lang], lang_code)
        audio_files.append(audio_file)
    return caption, translations, audio_files


with gr.Blocks() as demo:
    gr.Markdown("# Multilingual Assistive Model")
    with gr.Tabs():
        with gr.TabItem("Image Processing"):
            image_input = gr.Image(type="pil", label="Upload Image")
            lang_select = gr.CheckboxGroup(
                ["hin_Deva", "guj_Gujr", "urd_Arab"],
                label="Select Target Languages",
                value=["hin_Deva"],
            )
            process_btn = gr.Button("Process Image")
            caption_output = gr.Textbox(label="Generated Caption")
            translation_output = gr.JSON(label="Translations")
            audio_output = gr.Files(label="Audio Files")
            process_btn.click(
                image_tab,
                inputs=[image_input, lang_select],
                outputs=[caption_output, translation_output, audio_output],
            )
        with gr.TabItem("Document Upload"):
            doc_input = gr.File(label="Upload Document (PDF or TXT)", file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Process Document")
            doc_status = gr.Textbox(label="Status")
            upload_btn.click(process_document, inputs=doc_input, outputs=doc_status)
        with gr.TabItem("Chat with LLM"):
            chatbot = gr.Chatbot()
            msg = gr.Textbox(label="Your Message")
            clear = gr.Button("Clear")
            msg.submit(chat_with_llm, inputs=[msg, chatbot], outputs=chatbot)
            clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()