Spaces:
Runtime error
Runtime error
AsherKnight committed on
Commit ·
bddd1de
1
Parent(s): bebed16
Initial commit
Browse files- .huggingface/spaces.yaml +2 -0
- agents/__pycache__/agent1_image_issue.cpython-311.pyc +0 -0
- agents/__pycache__/agent2_tenancy_faq.cpython-311.pyc +0 -0
- agents/agent1_image_issue.py +61 -0
- agents/agent2_tenancy_faq.py +52 -0
- app.py +137 -0
- config/city_law_data.json +3 -0
- requirements.txt +8 -0
- utils/__pycache__/captioning.cpython-311.pyc +0 -0
- utils/__pycache__/llm_utils.cpython-311.pyc +0 -0
- utils/__pycache__/routing.cpython-311.pyc +0 -0
- utils/captioning.py +107 -0
- utils/llm_utils.py +45 -0
- utils/routing.py +32 -0
.huggingface/spaces.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sdk: streamlit
|
| 2 |
+
hardware: gpu
|
agents/__pycache__/agent1_image_issue.cpython-311.pyc
ADDED
|
Binary file (1.67 kB). View file
|
|
|
agents/__pycache__/agent2_tenancy_faq.cpython-311.pyc
ADDED
|
Binary file (2.85 kB). View file
|
|
|
agents/agent1_image_issue.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .agent2_tenancy_faq import handle_tenancy_query

# Image analysis + troubleshooting agent

# Minimum caption confidence before we act on the caption; below this we ask
# the user to clarify instead of guessing.
# NOTE(review): confidence comes from ImageCaptioning.get_best_caption (a CLIP
# softmax score or a negative LM loss) — confirm 0.1 is meaningful on that scale.
CLARITY_THRESHOLD = 0.1

# Canned fallback reply used when the caption confidence is too low.
CLARIFYING_QUESTION_PROMPT = (
    "I observed something in the image, but I'm not entirely sure what the issue is. "
    "Could you tell me more about what concerns you in this image?"
)

from utils.captioning import ImageCaptioning
from utils.llm_utils import LLaMAHelper

# Module-level singletons: the captioning models and the LLM are loaded once
# at import time and shared by every call to handle_image_issue().
captioner = ImageCaptioning()
llm = LLaMAHelper()
+
def handle_image_issue(user_input, image, history=None, context=None):
    """Diagnose a property issue from an image plus the user's message.

    Args:
        user_input: The user's current free-text message.
        image: PIL image to analyze; only captioned the first time it is seen.
        history: List of (question, answer) pairs from earlier turns.
        context: Mutable per-session dict; ``last_caption_data`` caches the
            (caption, confidence) pair so the image is captioned only once.

    Returns:
        The LLM's reply, or the canned clarifying question when the caption
        confidence is below CLARITY_THRESHOLD.
    """
    # Bug fix: literal [] / {} defaults are created once and shared across
    # calls, silently leaking caption/conversation state between sessions.
    if history is None:
        history = []
    if context is None:
        context = {}

    if context.get("last_caption_data"):
        caption, confidence = context["last_caption_data"]
        include_caption = False  # image was already processed
    else:
        caption, confidence = captioner.get_best_caption(image)
        context["last_caption_data"] = (caption, confidence)
        include_caption = True  # this is the first time we're seeing this image

    # Too unsure about what the image shows — ask instead of guessing.
    if confidence < CLARITY_THRESHOLD:
        return CLARIFYING_QUESTION_PROMPT

    user_context = "\n".join(f"User: {q}\nBot: {a}" for q, a in history)

    full_input = ""
    if include_caption:
        full_input += f"Possible Image description: {caption}\n"
    full_input += (
        f"User Input: {user_input}\n"
        f"Previous context of the conversation (keep it in hindsight): {user_context}\n"
    )

    system_prompt = """You are a property expert who analyzes property images, user inputs, and context to identify visible issues and suggest practical fixes immediately.

Your goals:
- Identify any clear issue from the image or the current user input.
- Suggest practical, actionable steps to fix or investigate the issue — **as soon as it's identifiable**.
- Use the previous conversation context **to support your understanding**, but **always prioritize the most recent user input if it contradicts earlier context**.
- Ask a follow-up question **only if you absolutely need more detail to provide a helpful or safe recommendation**.

IMPORTANT:
- You are speaking directly to the user — do not use third-person language.
- Assume the user is always concerned with knowing the fixes to the problem being discussed or diagnosed. That’s what you should stay focused on.
- Do not get carried away — focus on diagnosing the issue and providing clear fixes.
- Previous conversation context is provided **only** to help you make better suggestions — **do not reference the "previous conversation" explicitly to the user**.
- Your primary task is to help the user by giving practical suggestions, solutions, and fixes.
- **Do not delay suggestions** if you already have enough information to make a confident recommendation.
- If the user's latest input contradicts earlier context, **trust the current input and clarify only if needed**.
- Avoid unnecessary follow-up questions — ask only if you truly need more details to help effectively.

Now, analyze the image, user input, and context. Suggest a fix immediately if you can. Use context to support your response, but **always prioritize the user's most recent input**. Ask follow-up questions only if absolutely necessary and only when required to provide a helpful recommendation.
"""

    reply = llm.chat(system_prompt, full_input, temperature=0.46)
    return reply
agents/agent2_tenancy_faq.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from utils.llm_utils import LLaMAHelper
import spacy
from geotext import GeoText

# Initialize LLaMA and spaCy
# Module-level singletons shared by every tenancy query in this process.
llm = LLaMAHelper()
# Small English pipeline; used only for GPE (geo-political entity) detection.
nlp = spacy.load("en_core_web_sm")
| 8 |
+
|
| 9 |
+
def extract_location(text, method="spacy"):
    """Pull the first place name out of *text*.

    method: "spacy" uses GPE named-entity tags, "geotext" matches known
    country/city names; any other value yields "".
    """
    if method == "spacy":
        parsed = nlp(text)
        found = [entity.text for entity in parsed.ents if entity.label_ == "GPE"]
    elif method == "geotext":
        matches = GeoText(text)
        found = matches.countries + matches.cities
    else:
        return ""
    return found[0] if found else ""
| 19 |
+
|
| 20 |
+
def get_cached_location_from_history(history, method="spacy"):
    """Scan past (question, answer) turns, newest first, for a place name."""
    for past_question, _answer in reversed(history):
        found = extract_location(past_question, method)
        if found:
            return found
    return ""
| 26 |
+
|
| 27 |
+
def handle_tenancy_query(user_query, user_context, history=None, location_method="spacy"):
    """Answer a tenancy-law question, localized to the user's city/country when known.

    Args:
        user_query: The user's current question.
        user_context: Mutable session dict; ``location`` is read and, when a
            location is discovered, written back so later turns reuse it.
        history: List of (question, answer) pairs from earlier turns.
        location_method: Extractor passed through to extract_location().

    Returns:
        The LLM's reply string.
    """
    # Bug fix: a literal [] default is shared across calls; use None sentinel.
    if history is None:
        history = []

    # Use stored location if available
    location = user_context.get("location", "")

    # Otherwise, extract from current or previous queries
    if not location:
        location = extract_location(user_query, location_method)
        if not location:
            location = get_cached_location_from_history(history, location_method)

    # Persist the discovered location for later turns.
    if location:
        user_context["location"] = location

    system_prompt = "You are a legal assistant specializing in tenancy laws."
    prompt = ""
    if location:
        prompt += f" The user is from {location}."

    if history:
        chat_context = "\n".join(f"User: {q}\nBot: {a}" for q, a in history)
        prompt += f"\n\nPrevious conversation:\n{chat_context}"

    prompt += f"\n\nUser's current question: {user_query}\n\nGive a concise and helpful answer. If needed, ask a follow-up question to clarify."
    # Debug trace of the assembled prompt (typo "tenacy" fixed).
    print(f'prompt for tenancy faq is {prompt}')
    reply = llm.chat(system_prompt, prompt, temperature=0.7)
    return reply
app.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from utils.routing import route_agent
|
| 3 |
+
from agents.agent1_image_issue import handle_image_issue
|
| 4 |
+
from agents.agent2_tenancy_faq import handle_tenancy_query
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import torch
|
| 7 |
+
import hashlib
|
| 8 |
+
|
| 9 |
+
# Helper to generate MD5 hash from image
|
| 10 |
+
def get_image_hash(image):
    """Hex MD5 digest of the image's raw pixel bytes (cheap change detector)."""
    raw = image.tobytes()
    digest = hashlib.md5(raw)
    return digest.hexdigest()
| 12 |
+
|
| 13 |
+
# Main query handler
|
| 14 |
+
def handle_query(user_input, image=None, location='', history=None, context=None):
    """Route one chat turn to the right agent and update session state.

    Args:
        user_input: User's text message.
        image: Optional PIL image from the upload widget.
        location: Optional city/country string from the UI.
        history: List of (question, answer) pairs (Gradio State).
        context: Per-session dict tracking images, hashes, last agent,
            cached caption, and location (Gradio State).

    Returns:
        Tuple of (reply text, updated history, updated context, status label).
    """
    # Bug fix: mutable [] / {} defaults are shared across every call that
    # omits them, leaking one session's state into another.
    if history is None:
        history = []
    if context is None:
        context = {}
    try:
        response_ui_msg = ""

        # Initialize context if missing
        context.setdefault("images", [])
        context.setdefault("image_hashes", [])
        context.setdefault("last_agent", None)
        context.setdefault("last_caption_data", None)

        # If there's a new image (hash comparison dedupes re-submissions)
        if image is not None:
            new_hash = get_image_hash(image)
            if len(context["image_hashes"]) == 0 or new_hash != context["image_hashes"][-1]:
                context["images"].append(image)
                context["image_hashes"].append(new_hash)
                context["location"] = ""
                context["last_caption_data"] = None  # Reset cached caption
                response_ui_msg = "(New image attached. Starting image-related discussion.)"

        # If image is removed mid-chat
        if image is None and context["images"]:
            response_ui_msg = "(Image removed. Continuing as text-only query.)"

        # Use location if no image context
        if not context["images"] and location:
            context["location"] = location

        # Determine which agent should handle the query
        is_image_context = bool(context["images"])
        agent = route_agent(user_input, is_image_context)

        # Agent switch handling
        if context["last_agent"] == 'agent1' and agent == 'agent2':
            response_ui_msg += "\n(Switching to tenancy discussion based on your query...)"

        elif context["last_agent"] == 'agent2' and agent == 'agent1':
            response_ui_msg += "\n(Detected switch to image-based issue. Starting a new conversation...)"
            history.clear()
            context.clear()
            # Bug fix: `if image` triggers PIL Image truthiness; compare to
            # None explicitly, matching the checks above.
            context["images"] = [image] if image is not None else []
            context["image_hashes"] = [get_image_hash(image)] if image is not None else []
            context["last_caption_data"] = None
            context["last_agent"] = None
            context["location"] = location or ""

        # Update current agent
        context["last_agent"] = agent

        # Run the correct agent
        if agent == 'agent1':
            if context["images"]:
                result = handle_image_issue(user_input, context["images"][-1], history, context)
            else:
                result = "No image found to analyze."
        else:
            result = handle_tenancy_query(user_input, {"location": context.get("location")}, history)

        # Add message to response
        if response_ui_msg:
            result = f"{response_ui_msg}\n\n{result}"

        history.append((user_input, result))
        return result, history, context, "🟢 Chat Ongoing"

    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            error_msg = "⚠️ CUDA Out of Memory! Please try again later or reduce the image size."
            return error_msg, history, context, "🔴 Error"
        # Bare `raise` preserves the original traceback (`raise e` resets it).
        raise
| 85 |
+
|
| 86 |
+
# Reset function
|
| 87 |
+
def reset_chat():
    """Return fresh values for every UI component and state object."""
    empty_context = {"location": "", "images": [], "image_hashes": []}
    return "", "", None, [], empty_context, "🟡 New Chat Started", ""
| 89 |
+
|
| 90 |
+
# Clear just the conversation history
|
| 91 |
+
def clear_chat_history():
    """Drop the stored conversation turns; leave other session state untouched."""
    cleared = ([], "", "🧹 Chat history cleared")
    return cleared
| 93 |
+
|
| 94 |
+
# Build the Gradio interface
with gr.Blocks() as demo:
    # Per-session state objects (Gradio keeps a separate copy per browser session).
    conversation_history = gr.State([])
    user_context = gr.State({"location": "", "images": [], "image_hashes": []})
    # NOTE(review): session_state is created but never wired to any event —
    # session_indicator below is used instead; confirm whether this is dead state.
    session_state = gr.State("🟡 New Chat Started")

    gr.Markdown("# 🏠 Multi-Agent Real Estate Chatbot")
    gr.Markdown("Ask about property issues (with images) or tenancy questions!")

    with gr.Row():
        with gr.Column():
            # Left column: user inputs and action buttons.
            user_input = gr.Textbox(label="Enter your question:")
            location_input = gr.Textbox(label="Enter your city or country (optional):")
            image_input = gr.Image(type="pil", label="Upload an image (optional):")

            submit_btn = gr.Button("Submit")
            new_chat_btn = gr.Button("🔁 Start New Chat")
            clear_history_btn = gr.Button("🧹 Clear Chat History")

        with gr.Column():
            # Right column: chatbot reply and session status display.
            chatbot_output = gr.Textbox(label="Chatbot Response:", interactive=False, lines=8)
            session_indicator = gr.Textbox(label="Session Status", interactive=False)

    # Hook button logic
    # Submit runs the router; it reads and writes both State objects.
    submit_btn.click(
        handle_query,
        inputs=[user_input, image_input, location_input, conversation_history, user_context],
        outputs=[chatbot_output, conversation_history, user_context, session_indicator]
    )

    # New chat resets every input, both states, and the output panes.
    new_chat_btn.click(
        reset_chat,
        inputs=[],
        outputs=[user_input, location_input, image_input, conversation_history, user_context, session_indicator, chatbot_output]
    )

    # Clear history drops the turn list but keeps the image/location context.
    clear_history_btn.click(
        clear_chat_history,
        inputs=[],
        outputs=[conversation_history, chatbot_output, session_indicator]
    )

# Launch app
# NOTE(review): share=True opens a public tunnel; confirm that's intended
# when deployed on Spaces (Spaces serves the app without it).
demo.launch(share=True)
config/city_law_data.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"New York": {"notice_period": "30 days"}
|
| 3 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
torch
|
| 3 |
+
transformers
|
| 4 |
+
Pillow
|
| 5 |
+
gradio
|
| 6 |
+
ultralytics
|
| 7 |
+
spacy
|
| 8 |
+
geotext
|
utils/__pycache__/captioning.cpython-311.pyc
ADDED
|
Binary file (6.7 kB). View file
|
|
|
utils/__pycache__/llm_utils.cpython-311.pyc
ADDED
|
Binary file (2.31 kB). View file
|
|
|
utils/__pycache__/routing.cpython-311.pyc
ADDED
|
Binary file (845 Bytes). View file
|
|
|
utils/captioning.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoProcessor, AutoModelForCausalLM
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import torch
|
| 4 |
+
from ultralytics import YOLO # You need to install: pip install ultralytics
|
| 5 |
+
from transformers import CLIPProcessor, CLIPModel
|
| 6 |
+
|
| 7 |
+
class ImageCaptioning:
    """Generates image captions with BLIP and GIT, then uses CLIP to rank them.

    All three models are loaded eagerly in __init__ and moved to GPU when
    available; an instance is intended to be created once and reused.
    """

    def __init__(self):
        # Prefer CUDA when present; every model below is moved to this device.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load BLIP
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(self.device)

        # Load GIT
        self.git_processor = AutoProcessor.from_pretrained("microsoft/git-base")
        self.git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-base").to(self.device)

        # CLIP acts as a judge that scores caption/image similarity.
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)

        # # Load YOLO
        # self.yolo_model = YOLO("yolov8n.pt")  # You can use yolov8s.pt or others

    def generate_caption_blip(self, image):
        """Caption *image* with BLIP; returns (caption, log-prob score)."""
        inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
        print(f"Inputs keys: {inputs.keys()}")
        with torch.no_grad():
            output = self.blip_model.generate(**inputs)
        caption = self.blip_processor.decode(output[0], skip_special_tokens=True)
        return caption, self.compute_logprob(self.blip_model, inputs, output, self.blip_processor)

    def generate_caption_git(self, image):
        """Caption *image* with GIT; returns (caption, log-prob score)."""
        inputs = self.git_processor(images=image, return_tensors="pt").to(self.device)
        print(f"Inputs keys: {inputs.keys()}")
        with torch.no_grad():
            generated_ids = self.git_model.generate(**inputs)
        caption = self.git_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return caption, self.compute_logprob(self.git_model, inputs, generated_ids, self.git_processor)

    # def generate_caption_yolo(self, image):
    #     # Run YOLO detection
    #     results = self.yolo_model(image)
    #     detections = results[0].boxes.data  # [x1, y1, x2, y2, conf, class]
    #     names = results[0].names

    #     if len(detections) == 0:
    #         return "No objects detected", 0.0

    #     # Get top class labels with confidence
    #     label_conf_pairs = [(names[int(cls)], float(conf)) for *_, conf, cls in detections]
    #     label_conf_pairs.sort(key=lambda x: x[1], reverse=True)

    #     top_labels = list({label for label, _ in label_conf_pairs[:5]})  # top 5 unique labels
    #     avg_conf = sum([conf for _, conf in label_conf_pairs[:5]]) / len(top_labels)

    #     caption = "Image contains: " + ", ".join(top_labels)
    #     return caption, avg_conf

    def generate_caption_clip(self, image):
        """Generate BLIP and GIT candidate captions and pick the best via CLIP.

        Returns:
            (caption_text, clip_score) where clip_score is the softmax-normalized
            image-text similarity for the winning caption.
        """
        # Step 1: Generate caption candidates
        # NOTE(review): each candidate is a (caption, logprob) tuple — only the
        # text (index 0) is handed to CLIP below.
        caption_blip = self.generate_caption_blip(image)
        caption_git = self.generate_caption_git(image)
        candidates = [caption_blip, caption_git]

        # Extract text-only for CLIP scoring
        captions_only = [c[0] for c in candidates]

        # Step 2: Score them with CLIP
        inputs = self.clip_processor(text=captions_only, images=image, return_tensors="pt", padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.clip_model(**inputs)
            scores = outputs.logits_per_image[0]  # shape: (num_captions,)
            scores = scores.softmax(dim=0)  # optional: normalize scores

        best_idx = scores.argmax().item()
        best_caption = candidates[best_idx]
        best_score = scores[best_idx].item()

        return best_caption[0], best_score  # returning the caption text and score

    def compute_logprob(self, model, inputs, generated_ids, processor):
        """Score a generated caption as the negative LM loss (higher is better)."""
        # Decode the generated tokens to text
        caption_text = processor.decode(generated_ids[0], skip_special_tokens=True)

        # Tokenize the caption (text) to get labels and input_ids
        text_inputs = processor(text=caption_text, return_tensors="pt").to(self.device)
        labels = text_inputs["input_ids"]

        # Combine image inputs with the new input_ids (needed for loss computation)
        # NOTE(review): input_ids is passed both inside model_inputs and as
        # labels= — confirm both BLIP and GIT accept this calling convention.
        model_inputs = {**inputs, "input_ids": text_inputs["input_ids"]}

        # Compute the loss
        with torch.no_grad():
            outputs = model(**model_inputs, labels=labels)

        return -outputs.loss.item()  # Higher is better

    def get_best_caption(self, image):
        """Public entry point: caption *image* and return (caption, confidence)."""
        # This runs BLIP and GIT, then scores both with CLIP to pick the best caption
        caption, score = self.generate_caption_clip(image)
        print(f"Selected Caption: {caption} | Confidence: {score}")
        return caption, score
| 107 |
+
|
utils/llm_utils.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
| 2 |
+
import torch
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
class LLaMAHelper:
    """Thin wrapper around a chat-tuned LLaMA text-generation pipeline."""

    def __init__(self, hf_token=None):
        """Load tokenizer/model and build the generation pipeline.

        Args:
            hf_token: Hugging Face access token for the gated model; falls back
                to the HUGGINGFACE_TOKEN environment variable.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_id = "meta-llama/Llama-3.2-3B-Instruct"

        hf_token = hf_token or os.getenv("HUGGINGFACE_TOKEN")

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, token=hf_token)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            token=hf_token,
            # fp16 only where CUDA is available; fp16 on CPU is slow/unsupported.
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        ).to(self.device)

        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )

        # self.text_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")

    # def classifier(self, text, candidate_labels):
    #     return self.text_classifier(text, candidate_labels)

    def chat(self, system_prompt, prompt, max_new_tokens=300, temperature=0.5):
        """Send one system+user exchange to the model and return the reply.

        Returns:
            The lowercased reply with any echo of the prompt removed, or ""
            when the pipeline produces no output.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
        outputs = self.pipe(messages, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature)

        # Bug fix: the original referenced full_response even when `outputs`
        # was empty, raising UnboundLocalError at the print below.
        if not outputs:
            return ""

        last_message = outputs[0]["generated_text"][-1]
        # Chat pipelines return a list of message dicts; fall back to plain
        # text output if the last element is a string instead.
        # NOTE(review): lowercasing destroys capitalization in user-facing
        # answers — confirm this is intentional.
        if "content" in last_message:
            full_response = last_message["content"].lower()
        else:
            full_response = last_message.lower()

        print('response from LLM is', full_response)
        # Strip any echo of the prompt from the reply.
        return full_response.replace(prompt, "").strip()
| 45 |
+
|
utils/routing.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# agent_router.py
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
# Router to manage multi-agent classification
|
| 5 |
+
# agent_router.py
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
def route_agent(text, has_image):
    """
    Determines which agent should handle the query based on image presence and content type.
    """
    # Any image in play always goes to the image-issue agent.
    if has_image:
        return "agent1"  # Image-based input => Image issue agent

    # Tenancy-related keywords; the scan currently agrees with the fallback
    # but is kept so routing can later be refined per keyword.
    tenancy_keywords = (
        "rent", "lease", "tenant", "landlord", "agreement", "deposit",
        "eviction", "notice", "contract", "housing law", "tenancy", "sublet",
    )

    lowered = text.lower()
    if any(keyword in lowered for keyword in tenancy_keywords):
        return "agent2"

    # Fallback — you could route this to a clarification step instead
    return "agent2"
| 27 |
+
|
| 28 |
+
def clarify_prompt():
    """Canned question asking the user which kind of help they need."""
    first_part = "Just to clarify, are you asking about a visible issue with a property (you can also upload an image),\n"
    second_part = "or is this a general question about renting, laws, or agreements?"
    return first_part + second_part
|