Update app.py
app.py CHANGED
@@ -1,74 +1,296 @@
-import torch
 import gradio as gr
-model_name = "microsoft/phi-2"
-#device_map = {"": 0}
     low_cpu_mem_usage=True,
     return_dict=True,
-    torch_dtype=torch.
-    trust_remote_code=True
-    device_map=
 )
-tokenizer.padding_side = "right"
-generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=500, truncation=True)
-def chat(user_input, history=[]):
-    """Generates a response from the fine-tuned Phi-2 model with conversation memory."""
-    '''
-    # Format conversation history
-    formatted_history = ""
-    for usr, bot in history:
-        formatted_history += f"\n\n### User:\n{usr}\n\n### Assistant:\n{bot}"
-    prompt = f"{formatted_history}\n\n### User:\n{user_input}\n\n### Assistant:\n"
-    return answer
-    '''
-    prompt = f"\n\n### User:\n{user_input}\n\n### Assistant:\n"
-    response = generator(prompt, max_length=128, do_sample=True, truncation=True)
-    answer = response[0]["generated_text"].split("### Assistant:\n")[-1].strip()
-# ✅ Create Gradio Chat Interface
-chatbot = gr.ChatInterface(
-    fn=chat,
-    title="Fine-Tuned Phi-2 Conversational Chat Assistant",
-    description="🚀 Chat with a fine-tuned Phi-2 model. It remembers the conversation!",
-    theme="compact",
-)
 import gradio as gr
+import os
+import time
+from PIL import Image
+import torch
+import whisperx
+
+
+from transformers import CLIPVisionModel, CLIPImageProcessor, AutoModelForCausalLM, AutoTokenizer
+from models.vision_projector_model import VisionProjector
+from config import VisionProjectorConfig, app_config as cfg

+device = 'cuda' if torch.cuda.is_available() else 'cpu'

+clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+vision_projector = VisionProjector(VisionProjectorConfig())
+ckpt = torch.load(cfg['vision_projector_file'], map_location=torch.device(device))
+vision_projector.load_state_dict(ckpt['model_state_dict'])
+
+phi_base_model = AutoModelForCausalLM.from_pretrained(
+    'microsoft/phi-2',
     low_cpu_mem_usage=True,
     return_dict=True,
+    torch_dtype=torch.float32,
+    trust_remote_code=True
+    # device_map=device_map,
 )

+from peft import PeftModel
+phi_new_model = "models/phi_adapter"
+phi_model = PeftModel.from_pretrained(phi_base_model, phi_new_model)
+phi_model = phi_model.merge_and_unload().to(device)

+'''compute_type = 'float32'
+if device != 'cpu':
+    compute_type = 'float16'''

+audi_model = whisperx.load_model("small", device, compute_type='float16')

+tokenizer = AutoTokenizer.from_pretrained('microsoft/phi-2')
+tokenizer.pad_token = tokenizer.unk_token


+### app functions ##

+context_added = False
+query_added = False
+context = None
+context_type = ''
+query = ''
+bot_active = False

+def print_like_dislike(x: gr.LikeData):
+    print(x.index, x.value, x.liked)


+def add_text(history, text):
+    global context, context_type, context_added, query, query_added
+    context_added = False
+    if not context_type and '</context>' not in text:
+        context = "**Please add context (upload image/audio or enter text followed by \</context\>"
+        context_type = 'error'
+        context_added = True
+        query_added = False
+
+    elif '</context>' in text:
+        context_type = 'text'
+        context_added = True
+        text = text.replace('</context>', ' ')
+        context = text
+        query_added = False
+    elif context_type in ['[text]', '[image]', '[audio]']:
+        query = 'Human### ' + text + '\n' + 'AI### '
+        query_added = True
+        context_added = False
+    else:
+        query_added = False
+        context_added = True
+        context = 'error'
+        context = "**Please provide a valid context**"

+    history = history + [(text, None)]

+    return history, gr.Textbox(value="", interactive=False)


+def add_file(history, file):
+    global context_added, context, context_type, query_added
+
+    context = file
+    context_type = 'image'
+    context_added = True
+    query_added = False
+
+    history = history + [((file.name,), None)]
+
+    return history
+
+
+def audio_upload(history, audio_file):
+    global context, context_type, context_added, query, query_added
+
+    if audio_file:
+        context_added = True
+        context_type = 'audio'
+        context = audio_file
+        query_added = False
+        history = history + [((audio_file,), None)]
+
+    else:
+        pass
+
+    return history
+
+
+def preprocess_fn(history):
+    global context, context_added, query, context_type, query_added
+
+    if context_added:
+        if context_type == 'image':
+            image = Image.open(context)
+            inputs = clip_processor(images=image, return_tensors="pt")
+
+            x = clip_model(**inputs, output_hidden_states=True)
+            image_features = x.hidden_states[-2]
+
+            context = vision_projector(image_features)
+
+        elif context_type == 'audio':
+            audio_file = context
+            audio = whisperx.load_audio(audio_file)
+            result = audi_model.transcribe(audio, batch_size=1)
+
+            error = False
+            if result.get('language', None) and result.get('segments', None):
+                try:
+                    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+                    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+                except Exception as e:
+                    error = True
+
+                print(result.get('language', None))
+                if not error and result.get('segments', []) and len(result["segments"]) > 0 and result["segments"][0].get('text', None):
+                    text = result["segments"][0].get('text', '')
+                    print(text)
+                    context_type = 'audio'
+                    context_added = True
+                    context = text
+                    query_added = False
+                    print(context)
+                else:
+                    error = True
+            else:
+                error = True
+
+            if error:
+                context_type = 'error'
+                context_added = True
+                context = "**Please provide a valid audio file / context**"
+                query_added = False
+
+    print("Here")
+    return history
+
+def bot(history):
+    global context, context_added, query, context_type, query_added, bot_active
+
+    response = ''
+    if context_added:
+        context_added = False
+        if context_type == 'error':
+            response = context
+            query = ''
+
+        elif context_type in ['image', 'audio', 'text']:
+            response = ''
+            if context_type == 'audio':
+                response = 'Context: \n🗣 ' + '"_' + context.strip() + '_"\n\n'
+
+            response += "**Please proceed with your queries**"
+            query = ''
+            context_type = '[' + context_type + ']'
+    elif query_added:
+        query_added = False
+        if context_type == '[image]':
+            query_ids = tokenizer.encode(query)
+            query_ids = torch.tensor(query_ids, dtype=torch.int32).unsqueeze(0).to(device)
+            query_embeds = phi_model.get_input_embeddings()(query_ids)
+            inputs_embeds = torch.cat([context.to(device), query_embeds], dim=1)
+            out = phi_model.generate(inputs_embeds=inputs_embeds, min_new_tokens=10, max_new_tokens=50,
+                                     bos_token_id=tokenizer.bos_token_id)
+            response = tokenizer.decode(out[0], skip_special_tokens=True)
+        elif context_type in ['[text]', '[audio]']:
+            input_text = context + query
+
+            input_tokens = tokenizer.encode(input_text)
+            input_ids = torch.tensor(input_tokens, dtype=torch.int32).unsqueeze(0).to(device)
+            inputs_embeds = phi_model.get_input_embeddings()(input_ids)
+            out = phi_model.generate(inputs_embeds=inputs_embeds, min_new_tokens=10, max_new_tokens=50,
+                                     bos_token_id=tokenizer.bos_token_id)
+            response = tokenizer.decode(out[0], skip_special_tokens=True)
+        else:
+            query = ''
+            response = "**Please provide a valid context**"
+
+    if response:
+        bot_active = True
+        if history and len(history[-1]) > 1:
+            history[-1][1] = ""
+            for character in response:
+                history[-1][1] += character
+                time.sleep(0.05)
+                yield history
+
+    time.sleep(0.5)
+    bot_active = False
+
+
+
+def clear_fn():
+    global context_added, context_type, context, query, query_added
+    context_added = False
+    context_type = ''
+    context = None
+    query = ''
+    query_added = False
+
+    return {
+        chatbot: None
+    }
+
+
+with gr.Blocks() as app:
+    gr.Markdown(
+    """
+    # ContextGPT - A Multimodal chatbot
+    ### Upload image or audio to add a context. And then ask questions.
+    ### You can also enter text followed by \</context\> to set the context.
+    """
+    )
+
+    chatbot = gr.Chatbot(
+        [],
+        elem_id="chatbot",
+        bubble_full_width=False
+    )
+
+    with gr.Row():
+        txt = gr.Textbox(
+            scale=4,
+            show_label=False,
+            placeholder="Press enter to send ",
+            container=False,
+        )
+
+    with gr.Row():
+        aud = gr.Audio(sources=['microphone', 'upload'], type='filepath', max_length=100, show_download_button=True,
+                       show_share_button=True)
+        btn = gr.UploadButton("📷", file_types=["image"])
+
+    with gr.Row():
+        clear = gr.Button("Clear")
+
+    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+        preprocess_fn, chatbot, chatbot
+    ).then(
+        bot, chatbot, chatbot, api_name="bot_response"
+    )
+
+    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)
+
+    file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False).then(
+        preprocess_fn, chatbot, chatbot
+    ).then(
+        bot, chatbot, chatbot, api_name="bot_response"
+    )
+
+    chatbot.like(print_like_dislike, None, None)
+    clear.click(clear_fn, None, chatbot, queue=False)
+
+    aud.stop_recording(audio_upload, [chatbot, aud], [chatbot], queue=False).then(
+        preprocess_fn, chatbot, chatbot
+    ).then(
+        bot, chatbot, chatbot, api_name="bot_response"
+    )
+
+    aud.upload(audio_upload, [chatbot, aud], [chatbot], queue=False).then(
+        preprocess_fn, chatbot, chatbot
+    ).then(
+        bot, chatbot, chatbot, api_name="bot_response"
+    )
+
+app.queue()
+app.launch()