VDNT11 committed · verified
Commit 1316bcd · 1 Parent(s): 758690d

Update app.py

Files changed (1)
  1. app.py +17 -44
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import torch
 from PIL import Image
 import os
-from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, VitsTokenizer, VitsModel, AutoModelForCausalLM, set_seed
+from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
 from IndicTransToolkit import IndicProcessor
 from gtts import gTTS
 import soundfile as sf
@@ -12,27 +12,15 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.docstore.document import Document
 import PyPDF2
 import tempfile
-from huggingface_hub import login
-
-# Authenticate with Hugging Face token
-if os.getenv("HF_TOKEN"):
-    login(token=os.getenv("HF_TOKEN"))
-else:
-    raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Spaces settings.")
 
 # Initialize BLIP for image captioning
-blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda" if torch.cuda.is_available() else "cpu")
+blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
-# Initialize Mixtral-8x7B-Instruct for conversational tasks
-mixtral_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-mixtral_tokenizer = AutoTokenizer.from_pretrained(mixtral_model_name)
-mixtral_model = AutoModelForCausalLM.from_pretrained(
-    mixtral_model_name,
-    load_in_4bit=True,
-    device_map="auto",
-    torch_dtype=torch.bfloat16
-)
+# Initialize Gemma-2B-Instruct for conversational tasks
+gemma_model_name = "google/gemma-2b-it"
+gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_model_name)
+gemma_model = AutoModelForCausalLM.from_pretrained(gemma_model_name)
 
 # Initialize vector store and embeddings for RAG
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
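Note: google/gemma-2b-it is a gated checkpoint, and this commit also removes the HF_TOKEN login block. If the Space still needs authenticated downloads, one option (a sketch, assuming the HF_TOKEN secret from the removed block is still configured in the Space settings) is to pass the token per call:

    import os
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Sketch: authenticate per call instead of via a global login().
    # Assumes the HF_TOKEN secret is still set in the Space settings.
    hf_token = os.getenv("HF_TOKEN")
    gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=hf_token)
    gemma_model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token=hf_token)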
@@ -41,7 +29,7 @@ temp_dir = tempfile.mkdtemp()
 
 def generate_caption(image):
     image = image.convert("RGB")
-    inputs = blip_processor(image, "image of", return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
+    inputs = blip_processor(image, "image of", return_tensors="pt")
     with torch.no_grad():
         generated_ids = blip_model.generate(**inputs)
     caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
@@ -54,13 +42,11 @@ def translate_caption(caption, target_languages):
     model_IT2 = torch.quantization.quantize_dynamic(model_IT2, {torch.nn.Linear}, dtype=torch.qint8)
     ip = IndicProcessor(inference=True)
     src_lang = "eng_Latn"
-    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-    model_IT2.to(DEVICE)
     input_sentences = [caption]
     translations = {}
     for tgt_lang in target_languages:
         batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
-        inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt").to(DEVICE)
+        inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt")
         with torch.no_grad():
             generated_tokens = model_IT2.generate(**inputs, use_cache=True, min_length=0, max_length=256, num_beams=5, num_return_sequences=1)
         with tokenizer_IT2.as_target_tokenizer():
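Note: dropping the .to(DEVICE) calls is consistent with the quantize_dynamic line above, since PyTorch dynamic quantization runs on CPU only. A minimal self-contained sketch of the same pattern:

    import torch
    import torch.nn as nn

    # Dynamic quantization stores nn.Linear weights as int8 and quantizes
    # activations on the fly at inference time; it is CPU-only.
    model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))
    qmodel = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
    with torch.no_grad():
        out = qmodel(torch.randn(1, 16))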
@@ -75,18 +61,6 @@ def generate_audio_gtts(text, lang_code):
     tts.save(output_file)
     return output_file
 
-def generate_audio_fbmms(text, model_name):
-    output_file = os.path.join(temp_dir, f"{model_name.split('/')[-1]}.wav")
-    tokenizer = VitsTokenizer.from_pretrained(model_name)
-    model = VitsModel.from_pretrained(model_name)
-    inputs = tokenizer(text=text, return_tensors="pt")
-    set_seed(555)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    waveform = outputs.waveform[0].cpu().numpy()
-    sf.write(output_file, waveform, samplerate=model.config.sampling_rate)
-    return output_file
-
 def process_document(file):
     global vector_store
     if file.name.endswith(".pdf"):
@@ -96,7 +70,7 @@ def process_document(file):
             text += page.extract_text() or ""
     else:
         text = file.read().decode("utf-8")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
     chunks = text_splitter.split_text(text)
     documents = [Document(page_content=chunk) for chunk in chunks]
     vector_store = FAISS.from_documents(documents, embeddings)
@@ -107,13 +81,13 @@ def chat_with_llm(message, history):
     global vector_store
     context = ""
     if vector_store:
-        docs = vector_store.similarity_search(message, k=3)
+        docs = vector_store.similarity_search(message, k=2)
         context = "\n".join([doc.page_content for doc in docs])
-    prompt = f"[INST] You are a helpful assistant. Use the following context to answer the question accurately:\n\n{context}\n\nQuestion: {message} [/INST]"
-    inputs = mixtral_tokenizer(prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
+    prompt = f"<start_of_turn>user\nYou are a helpful assistant. Use the following context to answer the question accurately:\n\n{context}\n\nQuestion: {message}\n<end_of_turn>\n<start_of_turn>assistant"
+    inputs = gemma_tokenizer(prompt, return_tensors="pt")
     with torch.no_grad():
-        outputs = mixtral_model.generate(**inputs, max_length=1000, num_return_sequences=1, temperature=0.7)
-    response = mixtral_tokenizer.decode(outputs[0], skip_special_tokens=True)
+        outputs = gemma_model.generate(**inputs, max_length=500, num_return_sequences=1, temperature=0.7)
+    response = gemma_tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response.replace(prompt, "").strip()
 
 def image_tab(image, target_languages):
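Note: Gemma's chat template labels the reply turn "model", not "assistant", so the hand-built prompt above may not match what the model saw in training. A sketch that lets the tokenizer render the template instead (reusing the message and context variables from the hunk above):

    # Sketch: apply_chat_template builds the <start_of_turn> markup for us.
    chat = [{"role": "user", "content": f"Use the following context to answer:\n\n{context}\n\nQuestion: {message}"}]
    input_ids = gemma_tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
    with torch.no_grad():
        outputs = gemma_model.generate(input_ids, max_new_tokens=300)
    # Decode only the newly generated tokens; max_new_tokens also avoids the
    # max_length=500 cap being eaten by the prompt plus retrieved context.
    response = gemma_tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)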
@@ -133,7 +107,7 @@ with gr.Blocks() as demo:
     with gr.Tabs():
         with gr.TabItem("Image Processing"):
             image_input = gr.Image(type="pil", label="Upload Image")
-            lang_select = gr.CheckboxGroup(["hin_Deva", "mar_Deva", "guj_Gujr", "urd_Arab"], label="Select Target Languages", value=["hin_Deva", "mar_Deva"])
+            lang_select = gr.CheckboxGroup(["hin_Deva", "guj_Gujr", "urd_Arab"], label="Select Target Languages", value=["hin_Deva"])
             process_btn = gr.Button("Process Image")
             caption_output = gr.Textbox(label="Generated Caption")
             translation_output = gr.JSON(label="Translations")
@@ -151,5 +125,4 @@ with gr.Blocks() as demo:
     msg.submit(chat_with_llm, inputs=[msg, chatbot], outputs=chatbot)
     clear.click(lambda: None, None, chatbot, queue=False)
 
-demo.launch()
-
+demo.launch()
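Note (pre-existing, not changed by this commit): chat_with_llm returns a plain string, but msg.submit wires its output to a gr.Chatbot, which expects a list of (user, bot) pairs. A sketch of the usual pattern:

    # Sketch: append the new exchange to the running history instead of
    # returning a bare string, so gr.Chatbot can render it.
    def chat_with_llm(message, history):
        ...  # build `response` as before
        return (history or []) + [(message, response)]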
 