import gradio as gr
import torch
from PIL import Image
import os
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from IndicTransToolkit import IndicProcessor
from gtts import gTTS
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import PyPDF2
import tempfile
from huggingface_hub import login

# Authenticate with Hugging Face token
if os.getenv("HF_TOKEN"):
    login(token=os.getenv("HF_TOKEN"))
else:
    raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Spaces settings.")

# Initialize BLIP for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Initialize Mixtral-8x7B-Instruct for conversational tasks
mixtral_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
mixtral_tokenizer = AutoTokenizer.from_pretrained(mixtral_model_name)
mixtral_model = AutoModelForCausalLM.from_pretrained(mixtral_model_name)
mixtral_model = torch.quantization.quantize_dynamic(mixtral_model, {torch.nn.Linear}, dtype=torch.qint8)

# Initialize vector store and embeddings for RAG
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = None
temp_dir = tempfile.mkdtemp()

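# Generate an English caption for a PIL image with BLIP conditional
# captioning; the "image of" text prefix seeds the decoder.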
def generate_caption(image):
    image = image.convert("RGB")
    inputs = blip_processor(image, "image of", return_tensors="pt")
    with torch.no_grad():
        generated_ids = blip_model.generate(**inputs)
    caption = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
    return caption

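# Translate the English caption into the selected Indic languages with
# IndicTrans2 (en -> indic), using FLORES-style language codes such as
# "hin_Deva". Note: the model and tokenizer are loaded on every call;
# caching them at module level would avoid repeated loads.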
def translate_caption(caption, target_languages):
    model_name = "ai4bharat/indictrans2-en-indic-1B"
    tokenizer_IT2 = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model_IT2 = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
    model_IT2 = torch.quantization.quantize_dynamic(model_IT2, {torch.nn.Linear}, dtype=torch.qint8)
    ip = IndicProcessor(inference=True)
    src_lang = "eng_Latn"
    input_sentences = [caption]
    translations = {}
    for tgt_lang in target_languages:
        batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt")
        with torch.no_grad():
            generated_tokens = model_IT2.generate(**inputs, use_cache=True, min_length=0, max_length=256, num_beams=5, num_return_sequences=1)
        with tokenizer_IT2.as_target_tokenizer():
            generated_tokens = tokenizer_IT2.batch_decode(generated_tokens.detach().cpu().tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
        translated_texts = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
        translations[tgt_lang] = translated_texts[0]
    return translations

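# Synthesize speech for the given text with gTTS and return the path to
# the saved MP3 file.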
def generate_audio_gtts(text, lang_code):
    output_file = os.path.join(temp_dir, f"{lang_code}_gTTS.mp3")
    tts = gTTS(text=text, lang=lang_code)
    tts.save(output_file)
    return output_file

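# Build the RAG index: extract text from the uploaded PDF/TXT file, split
# it into overlapping chunks, embed them, and store them in a FAISS index.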
def process_document(file):
    global vector_store
    if file.name.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file.name)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""
    else:
        # Read the upload by path; the file object Gradio passes in may
        # already be closed, so opening file.name is more robust.
        with open(file.name, "r", encoding="utf-8") as f:
            text = f.read()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    documents = [Document(page_content=chunk) for chunk in chunks]
    vector_store = FAISS.from_documents(documents, embeddings)
    vector_store.save_local(os.path.join(temp_dir, "faiss_index"))
    return "Document processed and indexed successfully."

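# Retrieval-augmented chat: fetch the two most relevant chunks from the
# FAISS index (if a document has been indexed) and answer with Mixtral
# using its [INST] ... [/INST] instruction format.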
def chat_with_llm(message, history):
    global vector_store
    context = ""
    if vector_store:
        docs = vector_store.similarity_search(message, k=2)
        context = "\n".join([doc.page_content for doc in docs])
    prompt = f"[INST] You are a helpful assistant. Use the following context to answer the question accurately:\n\n{context}\n\nQuestion: {message} [/INST]"
    inputs = mixtral_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        # max_new_tokens bounds only the reply (max_length would count the
        # prompt too); do_sample=True is required for temperature to apply.
        outputs = mixtral_model.generate(**inputs, max_new_tokens=500, num_return_sequences=1, do_sample=True, temperature=0.7)
    # Decode only the newly generated tokens so the prompt is not echoed back.
    response = mixtral_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
    # gr.Chatbot expects the full history as (user, assistant) pairs.
    history = (history or []) + [(message, response)]
    return history

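# Pipeline for the image tab: caption -> translate -> text-to-speech.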
def image_tab(image, target_languages):
    if image is None:
        return "Please upload an image.", {}, []
    caption = generate_caption(image)
    translations = translate_caption(caption, target_languages) if target_languages else {}
    audio_files = []
    for lang in target_languages:
        lang_code = {"hin_Deva": "hi", "guj_Gujr": "gu", "urd_Arab": "ur"}.get(lang, "en")
        audio_file = generate_audio_gtts(translations[lang], lang_code)
        audio_files.append(audio_file)
    return caption, translations, audio_files

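# Gradio UI: three tabs for image captioning/translation, document
# indexing, and RAG chat.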
with gr.Blocks() as demo:
    gr.Markdown("# Multilingual Assistive Model")
    with gr.Tabs():
        with gr.TabItem("Image Processing"):
            image_input = gr.Image(type="pil", label="Upload Image")
            lang_select = gr.CheckboxGroup(["hin_Deva", "guj_Gujr", "urd_Arab"], label="Select Target Languages", value=["hin_Deva"])
            process_btn = gr.Button("Process Image")
            caption_output = gr.Textbox(label="Generated Caption")
            translation_output = gr.JSON(label="Translations")
            audio_output = gr.Files(label="Audio Files")
            process_btn.click(image_tab, inputs=[image_input, lang_select], outputs=[caption_output, translation_output, audio_output])
        with gr.TabItem("Document Upload"):
            doc_input = gr.File(label="Upload Document (PDF or TXT)", file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Process Document")
            doc_status = gr.Textbox(label="Status")
            upload_btn.click(process_document, inputs=doc_input, outputs=doc_status)
        with gr.TabItem("Chat with LLM"):
            chatbot = gr.Chatbot()
            msg = gr.Textbox(label="Your Message")
            clear = gr.Button("Clear")
            msg.submit(chat_with_llm, inputs=[msg, chatbot], outputs=chatbot)
            clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()