# app.py — Multilingual Assistive Model (Hugging Face Space, author VDNT11, rev c51d070)
import gradio as gr
import torch
from PIL import Image
import os
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from IndicTransToolkit import IndicProcessor
from gtts import gTTS
import soundfile as sf
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import PyPDF2
import tempfile
from huggingface_hub import login
# Authenticate with Hugging Face token.
# Gated checkpoints (e.g. Mixtral) need an authenticated hub session, so fail
# fast at startup if the Space secret is missing rather than at first download.
if os.getenv("HF_TOKEN"):
    login(token=os.getenv("HF_TOKEN"))
else:
    raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Spaces settings.")

# Initialize BLIP for image captioning (loaded once at import time).
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Initialize Mixtral-8x7B-Instruct for conversational tasks.
mixtral_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
mixtral_tokenizer = AutoTokenizer.from_pretrained(mixtral_model_name)
mixtral_model = AutoModelForCausalLM.from_pretrained(mixtral_model_name)
# Dynamic int8 quantization of the Linear layers to reduce CPU memory/latency.
mixtral_model = torch.quantization.quantize_dynamic(mixtral_model, {torch.nn.Linear}, dtype=torch.qint8)

# Initialize vector store and embeddings for RAG.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = None  # set by process_document(); stays None until a document is indexed
temp_dir = tempfile.mkdtemp()  # scratch dir for generated audio files and the FAISS index
def generate_caption(image):
    """Return an English caption for a PIL image using the global BLIP model.

    The image is converted to RGB first, and the fixed prompt "image of" is
    passed to BLIP for conditional captioning.
    """
    rgb_image = image.convert("RGB")
    blip_inputs = blip_processor(rgb_image, "image of", return_tensors="pt")
    with torch.no_grad():
        output_ids = blip_model.generate(**blip_inputs)
    return blip_processor.decode(output_ids[0], skip_special_tokens=True)
def translate_caption(caption, target_languages):
    """Translate an English caption into the given IndicTrans2 target languages.

    Args:
        caption: English source sentence.
        target_languages: iterable of IndicTrans2 language tags
            (e.g. "hin_Deva", "guj_Gujr", "urd_Arab").

    Returns:
        dict mapping each target language tag to its translated string.

    The IndicTrans2 tokenizer, model and processor are loaded (and the model
    int8-quantized) exactly once and cached on the function object — the
    original reloaded and re-quantized the 1B-parameter model on every call,
    which is prohibitively slow.
    """
    if not hasattr(translate_caption, "_cached"):
        model_name = "ai4bharat/indictrans2-en-indic-1B"
        tokenizer_IT2 = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model_IT2 = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
        # Dynamic int8 quantization of Linear layers for CPU inference.
        model_IT2 = torch.quantization.quantize_dynamic(model_IT2, {torch.nn.Linear}, dtype=torch.qint8)
        translate_caption._cached = (tokenizer_IT2, model_IT2, IndicProcessor(inference=True))
    tokenizer_IT2, model_IT2, ip = translate_caption._cached

    src_lang = "eng_Latn"
    input_sentences = [caption]
    translations = {}
    for tgt_lang in target_languages:
        # IndicProcessor adds the language tags and normalizes the input.
        batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt")
        with torch.no_grad():
            generated_tokens = model_IT2.generate(
                **inputs, use_cache=True, min_length=0, max_length=256,
                num_beams=5, num_return_sequences=1,
            )
        # Decode in target-tokenizer mode, as required by IndicTrans2.
        with tokenizer_IT2.as_target_tokenizer():
            generated_tokens = tokenizer_IT2.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True, clean_up_tokenization_spaces=True,
            )
        translated_texts = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
        translations[tgt_lang] = translated_texts[0]
    return translations
def generate_audio_gtts(text, lang_code):
    """Synthesize `text` to speech with gTTS and return the saved mp3 path.

    The file is written into the module-level temp_dir and named after the
    language code (one file per language; re-runs overwrite it).
    """
    destination = os.path.join(temp_dir, f"{lang_code}_gTTS.mp3")
    gTTS(text=text, lang=lang_code).save(destination)
    return destination
def process_document(file):
    """Extract text from an uploaded PDF/TXT file, chunk it, and index it.

    Side effects: replaces the module-level `vector_store` with a fresh FAISS
    index over the chunks and persists it under temp_dir. Returns a status
    string for the UI.
    """
    global vector_store
    if file.name.endswith(".pdf"):
        pdf_pages = PyPDF2.PdfReader(file).pages
        # extract_text() can return None for image-only pages; treat as empty.
        text = "".join(page.extract_text() or "" for page in pdf_pages)
    else:
        text = file.read().decode("utf-8")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    documents = [Document(page_content=piece) for piece in splitter.split_text(text)]
    vector_store = FAISS.from_documents(documents, embeddings)
    vector_store.save_local(os.path.join(temp_dir, "faiss_index"))
    return "Document processed and indexed successfully."
def chat_with_llm(message, history):
    """Answer a chat message with Mixtral, using RAG context when available.

    Args:
        message: the user's question.
        history: current gr.Chatbot history (list of (user, bot) pairs, or None).

    Returns:
        The updated history list including (message, response). The original
        returned a bare string, which gr.Chatbot cannot render — the chat tab
        failed at runtime.
    """
    global vector_store
    history = history or []
    context = ""
    if vector_store:
        docs = vector_store.similarity_search(message, k=2)
        context = "\n".join(doc.page_content for doc in docs)
    prompt = f"[INST] You are a helpful assistant. Use the following context to answer the question accurately:\n\n{context}\n\nQuestion: {message} [/INST]"
    inputs = mixtral_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        # max_new_tokens bounds only the reply; the original's max_length=500
        # counted prompt tokens too and could truncate long RAG prompts to
        # nothing. temperature has no effect unless do_sample=True.
        outputs = mixtral_model.generate(
            **inputs, max_new_tokens=500, num_return_sequences=1,
            do_sample=True, temperature=0.7,
        )
    # Decode only the newly generated tokens; string-replacing the prompt is
    # fragile because detokenization need not reproduce it byte-for-byte.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = mixtral_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    history.append((message, response))
    return history
def image_tab(image, target_languages):
    """Caption an uploaded image, translate the caption, and synthesize audio.

    Returns (caption, translations dict, list of audio file paths); when no
    image is supplied, returns a status message with empty results.
    """
    if not image:
        return "Please upload an image.", {}, []
    caption = generate_caption(image)
    if target_languages:
        translations = translate_caption(caption, target_languages)
    else:
        translations = {}
    # Map IndicTrans2 language tags to gTTS codes; default to English.
    gtts_codes = {"hin_Deva": "hi", "guj_Gujr": "gu", "urd_Arab": "ur"}
    audio_files = [
        generate_audio_gtts(translations[lang], gtts_codes.get(lang, "en"))
        for lang in target_languages
    ]
    return caption, translations, audio_files
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Multilingual Assistive Model")
    with gr.Tabs():
        # Tab 1: caption an image, translate, and produce per-language audio.
        with gr.TabItem("Image Processing"):
            image_input = gr.Image(type="pil", label="Upload Image")
            lang_select = gr.CheckboxGroup(["hin_Deva", "guj_Gujr", "urd_Arab"], label="Select Target Languages", value=["hin_Deva"])
            process_btn = gr.Button("Process Image")
            caption_output = gr.Textbox(label="Generated Caption")
            translation_output = gr.JSON(label="Translations")
            audio_output = gr.Files(label="Audio Files")
            process_btn.click(image_tab, inputs=[image_input, lang_select], outputs=[caption_output, translation_output, audio_output])
        # Tab 2: upload a PDF/TXT document and build the RAG index.
        with gr.TabItem("Document Upload"):
            doc_input = gr.File(label="Upload Document (PDF or TXT)", file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Process Document")
            doc_status = gr.Textbox(label="Status")
            upload_btn.click(process_document, inputs=doc_input, outputs=doc_status)
        # Tab 3: chat with the LLM over the indexed document (if any).
        with gr.TabItem("Chat with LLM"):
            chatbot = gr.Chatbot()
            msg = gr.Textbox(label="Your Message")
            clear = gr.Button("Clear")
            # NOTE(review): chat_with_llm's return value is wired straight into
            # gr.Chatbot, which expects a list of (user, bot) pairs — verify
            # the callback returns that shape.
            msg.submit(chat_with_llm, inputs=[msg, chatbot], outputs=chatbot)
            # Reset the chat display by emitting None into the Chatbot.
            clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()