import gradio as gr
import os
import faiss
import pickle
import numpy as np
import torch
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential
# Load models: MiniLM for text embeddings, CLIP for image embeddings
text_encoder = SentenceTransformer('all-MiniLM-L6-v2')
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# Embedding functions
def embed_text(text):
    return text_encoder.encode(text)

def embed_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = clip_model.get_image_features(**inputs)
    return outputs.squeeze().cpu().numpy()
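
# The FAISS indices and pickled metadata files loaded below ("text_vector.index",
# "image_vector.index", and their ".metadata" companions) are built offline and are
# not part of this app. The helper below is a minimal sketch of how such an index
# could be produced with the same encoders; it assumes the metadata is a pickled
# list of dicts with 'file', 'page', 'type', and 'content' keys, which is the shape
# the search code below expects.
def build_text_index(chunks, index_path="text_vector.index", metadata_path="text_vector.metadata"):
    # chunks: list of dicts carrying 'file', 'page', 'type', and 'content'
    embeddings = np.vstack([embed_text(c["content"]) for c in chunks]).astype("float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search over chunk embeddings
    index.add(embeddings)
    faiss.write_index(index, index_path)
    with open(metadata_path, "wb") as f:
        pickle.dump(chunks, f)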
# Search + prompt
def semantic_search_and_prompt(query, top_k=5):
    # Route to the image index if the query is a path to an existing file,
    # otherwise embed it as text and search the text index.
    if isinstance(query, str) and os.path.exists(query):
        query_embedding = embed_image(query).astype('float32').reshape(1, -1)
        index = faiss.read_index("image_vector.index")
        metadata_path = "image_vector.metadata"
    else:
        query_embedding = embed_text(query).astype('float32').reshape(1, -1)
        index = faiss.read_index("text_vector.index")
        metadata_path = "text_vector.metadata"

    with open(metadata_path, "rb") as f:
        metadata = pickle.load(f)

    # Retrieve the top-k nearest chunks and attach their distance scores
    D, I = index.search(query_embedding, top_k)
    top_k_chunks = [dict(metadata[i], score=float(D[0][j])) for j, i in enumerate(I[0])]

    context = "\n\n".join([
        f"[{chunk['type']} from page {chunk['page']} of {chunk['file']}]:\n{chunk.get('content', '')}"
        for chunk in top_k_chunks
    ])

    # Image queries get a generic question; text queries are passed through verbatim
    if isinstance(query, str) and not os.path.exists(query):
        user_query = query
    else:
        user_query = "What is shown in this image?"

    prompt = f"""
You are an expert assistant helping users answer questions based on a collection of documents.
Use the provided context chunks to answer the question accurately and clearly.
Context: {context}
Question: {user_query}
Answer:"""
    return prompt, top_k_chunks
# LLM client setup (Azure AI Inference SDK pointed at the GitHub Models endpoint;
# requires a GITHUB_TOKEN environment variable)
endpoint = "https://models.github.ai/inference"
model = "deepseek/DeepSeek-V3-0324"
token = os.getenv("GITHUB_TOKEN")
client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(token))
# Main pipeline for Gradio
def handle_query(text_input, image_input):
    # Prefer an uploaded image over the text box; fall back to the text query
    if image_input is not None:
        image_path = "query_image.png"
        image_input.save(image_path)
        query = image_path
    elif text_input:
        query = text_input
    else:
        return "Please provide a text query or an image.", None

    prompt, chunks = semantic_search_and_prompt(query)

    response = client.complete(
        messages=[
            SystemMessage(content="You are a helpful assistant."),
            UserMessage(content=prompt),
        ],
        temperature=1.0,
        top_p=1.0,
        max_tokens=1000,
        model=model
    )
    answer = response.choices[0].message.content

    # Markdown list of retrieved chunks for the references accordion
    references = "\n".join([
        f"- **{chunk['file']}** | Page {chunk['page']} | Type: *{chunk['type']}* | Score: `{chunk['score']:.2f}`"
        for chunk in chunks
    ])
    return answer, references
# Gradio UI
def launch_app():
    with gr.Blocks() as demo:
        gr.Markdown("## 📄🎓 Multimodal Chatbot for FAST-NUCES")
        with gr.Row():
            text_input = gr.Textbox(label="Enter your query")
            image_input = gr.Image(label="Upload an image", type="pil")
        with gr.Row():
            btn = gr.Button("Submit")
            btn_clear = gr.Button("Clear")
        gr.Markdown("### 🧠 LLM Response:")
        answer_output = gr.Markdown()
        with gr.Accordion("📚 Source References", open=False):
            reference_output = gr.Markdown()

        # Submit button
        btn.click(
            fn=handle_query,
            inputs=[text_input, image_input],
            outputs=[answer_output, reference_output],
        )
        # Clear button
        btn_clear.click(
            fn=lambda: ("", None, "", ""),
            inputs=[],
            outputs=[text_input, image_input, answer_output, reference_output],
        )

    demo.launch()

if __name__ == "__main__":
    launch_app()