Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -66,7 +66,8 @@ _SKILL_REGEX = re.compile(r"\b(Natural Language Processing|Building Information
|
|
| 66 |
|
| 67 |
def extract_skills_from_text(cv_text: str) -> List[str]:
|
| 68 |
skills = list({m.group(0).lower() for m in _SKILL_REGEX.finditer(cv_text)})
|
| 69 |
-
return [s.capitalize() for s in skills
|
|
|
|
| 70 |
|
| 71 |
# --- Process uploaded file (PDF, DOCX, TXT) ---
|
| 72 |
def process_uploaded_file(file_obj: Any) -> dict | None:
|
|
@@ -222,33 +223,110 @@ ensure_collections()
|
|
| 222 |
|
| 223 |
# -------------------- Query Weaviate --------------------
|
| 224 |
def query_weaviate_collection(class_name: str, query_text: str, limit: int = 5) -> List[dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
try:
|
| 226 |
collection = weaviate_client.collections.get(class_name)
|
| 227 |
|
| 228 |
-
#
|
| 229 |
-
response = collection.query.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
items = [obj.properties for obj in response.objects]
|
| 231 |
-
|
| 232 |
-
# fallback filter if nothing found
|
| 233 |
-
if not items:
|
| 234 |
-
filters = Filter.any_of([
|
| 235 |
-
Filter.by_property("title").like(f"*{query_text}*"),
|
| 236 |
-
Filter.by_property("description").like(f"*{query_text}*")
|
| 237 |
-
])
|
| 238 |
-
if class_name != "Team":
|
| 239 |
-
filters = Filter.any_of([
|
| 240 |
-
Filter.by_property("title").like(f"*{query_text}*"),
|
| 241 |
-
Filter.by_property("skills").like(f"*{query_text}*"),
|
| 242 |
-
Filter.by_property("description").like(f"*{query_text}*")
|
| 243 |
-
])
|
| 244 |
-
response_fallback = collection.query.fetch_objects(limit=limit, filters=filters)
|
| 245 |
-
items = [obj.properties for obj in response_fallback.objects]
|
| 246 |
-
|
| 247 |
return items
|
|
|
|
| 248 |
except Exception as e:
|
| 249 |
print(f"[Weaviate Query Error] {e}")
|
| 250 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
# -------------------- RAG Prompt Builder --------------------
|
| 253 |
def build_rag_prompt(user_question: str, retrieved_items: List[dict], class_name: str) -> str:
|
| 254 |
context_parts = []
|
|
@@ -811,6 +889,7 @@ with gr.Blocks(css="""
|
|
| 811 |
with gr.Row():
|
| 812 |
clear_btn = gr.Button("Reset Conversation")
|
| 813 |
instructions = gr.Markdown("Commands: `apply`, `create team`, `join team`, `recommend` — the bot will guide you step-by-step.")
|
|
|
|
| 814 |
|
| 815 |
# persistent state across turns
|
| 816 |
chat_history_state = gr.State([])
|
|
|
|
| 66 |
|
| 67 |
def extract_skills_from_text(cv_text: str) -> List[str]:
    """Return the distinct skills that ``_SKILL_REGEX`` finds in *cv_text*.

    Every match is lower-cased before de-duplication, so the same skill
    written with different casing is reported once. Each surviving skill is
    then passed through ``str.capitalize``.

    Args:
        cv_text: Text to scan for skill names.

    Returns:
        The unique matched skills, capitalized and in alphabetical order.
        An empty list when nothing matches.
    """
    unique_skills = {match.group(0).lower() for match in _SKILL_REGEX.finditer(cv_text)}
    # Sort before returning: iterating a set of strings yields an order that
    # can differ from one interpreter run to the next.
    return [skill.capitalize() for skill in sorted(unique_skills)]
|
| 70 |
+
|
| 71 |
|
| 72 |
# --- Process uploaded file (PDF, DOCX, TXT) ---
|
| 73 |
def process_uploaded_file(file_obj: Any) -> dict | None:
|
|
|
|
| 223 |
|
| 224 |
# -------------------- Query Weaviate --------------------
|
| 225 |
def query_weaviate_collection(class_name: str, query_text: str, limit: int = 5) -> List[dict]:
    """Run a hybrid search against a single Weaviate collection.

    Hybrid search combines vector (semantic) and keyword (BM25) matching,
    which is intended to give more relevant results for conversational
    queries.

    Args:
        class_name: Name of the collection to query.
        query_text: The search text.
        limit: Maximum number of objects to request.

    Returns:
        The ``properties`` of every object in the response. An empty list is
        returned when the query raises; the error is printed, not re-raised.
    """
    # Only the "Job" collection restricts the searched properties, boosting
    # the title so that title matches are prioritized; every other collection
    # passes None.
    if class_name == "Job":
        searched_properties = ["title^2", "description", "skills"]
    else:
        searched_properties = None

    try:
        target = weaviate_client.collections.get(class_name)
        result = target.query.hybrid(
            query=query_text,
            limit=limit,
            query_properties=searched_properties,
        )
        return [found.properties for found in result.objects]
    except Exception as e:
        print(f"[Weaviate Query Error] {e}")
        return []
|
| 247 |
+
# -------------------- NEW: Search All Collections --------------------
|
| 248 |
+
# -------------------- RAG Answer (Modified for Multi-Class Search) --------------------
|
| 249 |
+
def rag_answer_all(user_question: str, top_k: int = 3) -> tuple[str, list[dict]]:
    """Answer a question from records retrieved across several collections.

    The retrieved records are grouped by their ``class_name``, rendered as
    JSON inside a single prompt, and sent to the chat model.

    Args:
        user_question: The user's free-text question.
        top_k: Forwarded to ``search_all_collections`` as ``limit_per_class``.

    Returns:
        A ``(answer, retrieved_items)`` pair. When nothing is retrieved the
        answer is an apology message and the list is empty. When the LLM call
        raises, the error is printed, the answer is a fixed fallback message,
        and the retrieved items are still returned.
    """
    # Step 1: Search across all relevant collections.
    # NOTE(review): search_all_collections is not defined in the visible part
    # of this file -- confirm it exists, otherwise this raises NameError.
    retrieved_items = search_all_collections(user_question, limit_per_class=top_k)

    if not retrieved_items:
        return f"Sorry, I couldn't find any results related to '{user_question}' in our Jobs, Projects, or Opportunities databases.", []

    # Step 2: Group results by class for clearer presentation in the prompt.
    # Each item is expected to carry "class_name" and "properties" keys.
    grouped_results: dict[str, list[dict]] = {}
    for item in retrieved_items:
        grouped_results.setdefault(item["class_name"], []).append(item["properties"])

    context_parts = []
    for class_name, items in grouped_results.items():
        context_parts.append(f"\n--- Results from '{class_name}' collection ---")
        for i, properties in enumerate(items, 1):
            # Stringify every value so json.dumps never meets a type it
            # cannot serialize.
            details = {k: str(v) for k, v in properties.items()}
            item_str = f"Record {i}:\n{json.dumps(details, indent=2, ensure_ascii=False)}"
            context_parts.append(item_str)

    context_block = "\n".join(context_parts)

    # NOTE(review): the leading whitespace of the prompt lines could not be
    # recovered from the source view; confirm against the original file.
    prompt = f"""
    User Question: "{user_question}"

    You are an expert AI assistant. Your mission is to analyze structured data from different categories (Jobs, Projects, Opportunities) and present a comprehensive, clear summary to the user.

    **Primary Directive:** Your ONLY source of information is the structured JSON data provided below under "Retrieved Data". If the data section is empty, state that no results were found.

    **Your Core Instructions:**
    1. **Acknowledge the Categories:** Analyze all the data provided from each collection (`Job`, `Project`, `Opportunities`).
    2. **Summarize Logically:** For each result, **you must clearly state which category it belongs to**. For example, start with "I found a **Job** opportunity:" or "Here is a **Project** you might be interested in:".
    3. **Present All Details:** Convert the data for each item into natural, readable language, covering all important details like title, company/creator, description, and skills.
    4. **Use Clear Formatting:** Use Markdown headings (e.g., `### Job: [Title]`) and bullet points to make the response easy to read.

    Retrieved Data:
    {context_block}
    """

    # Step 3: Call the LLM to get the final answer. Any failure, including an
    # empty ``choices`` list, falls back to a fixed apology.
    try:
        resp = llm_client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT_BASE},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            max_tokens=4096,
        )
        answer = resp.choices[0].message.content or ""
    except Exception as e:
        print(f"[RAG LLM Error] {e}")
        answer = "⚠️ Sorry, I couldn't process that. Try again later."

    return answer, retrieved_items
|
| 309 |
+
|
| 310 |
+
# ========== IDLE STATE ==========
|
| 311 |
+
if st == "idle":
|
| 312 |
+
low = text.lower()
|
| 313 |
+
|
| 314 |
+
# ... (The first parts for greetings and flow starters remain the same)
|
| 315 |
+
# ... (e.g., if any(k in low for k in ["apply",...])
|
| 316 |
+
# ... (e.g., if any(k in low for k in ["team",...])
|
| 317 |
+
# ... (e.g., if any(k in low for k in ["recommend",...])
|
| 318 |
+
|
| 319 |
+
# 3) Check for specific Knowledge Base intents
|
| 320 |
+
intent = route_intent(text)
|
| 321 |
+
if intent and intent.startswith("kb_"):
|
| 322 |
+
kb_ans = kb_fallback(intent)
|
| 323 |
+
if kb_ans:
|
| 324 |
+
return kb_ans, session, False
|
| 325 |
+
|
| 326 |
+
# 4) If it's not a command or KB question, perform a global RAG search
|
| 327 |
+
# This is now the default action for any general query.
|
| 328 |
+
rag_ans, _ = rag_answer_all(text)
|
| 329 |
+
return rag_ans, session, False
|
| 330 |
# -------------------- RAG Prompt Builder --------------------
|
| 331 |
def build_rag_prompt(user_question: str, retrieved_items: List[dict], class_name: str) -> str:
|
| 332 |
context_parts = []
|
|
|
|
| 889 |
with gr.Row():
|
| 890 |
clear_btn = gr.Button("Reset Conversation")
|
| 891 |
instructions = gr.Markdown("Commands: `apply`, `create team`, `join team`, `recommend` — the bot will guide you step-by-step.")
|
| 892 |
+
|
| 893 |
|
| 894 |
# persistent state across turns
|
| 895 |
chat_history_state = gr.State([])
|