afouda committed on
Commit
e01009a
·
verified ·
1 Parent(s): 4cafee6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +877 -247
app.py CHANGED
@@ -1,225 +1,749 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
  import os
3
  import json
4
  import time
 
5
  import uuid
 
6
  from dataclasses import dataclass
7
- from typing import List, Dict, Any
8
 
9
- # --- Gradio & UI ---
10
  import markdown
11
  import gradio as gr
12
-
13
- # --- LLM & Vector DB ---
14
  from openai import OpenAI
15
- import weaviate
16
-
17
- # --- File Processing ---
18
  import fitz # PyMuPDF
19
  import docx
 
 
 
20
 
21
- # --- 1. BACKEND LOGIC & CONFIG ---
 
 
 
22
 
23
- # --- LLM Configuration (DeepInfra) ---
24
- MODEL_NAME = "openai/gpt-oss-120b"
25
- # تأكد من وضع مفتاح API الخاص بك هنا
26
- DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY", "kPEm10rrnxXrCf0TuB6Xcd7Y7lp3YgKa")
27
- BASE_URL = "https://api.deepinfra.com/v1/openai"
28
 
29
- if not DEEPINFRA_API_KEY:
30
- print("[WARN] DEEPINFRA_API_KEY is not set. The chatbot will likely fail.")
31
 
 
 
32
  llm_client = OpenAI(api_key=DEEPINFRA_API_KEY, base_url=BASE_URL)
33
 
34
- # --- Weaviate RAG Configuration ---
35
- WEAVIATE_URL = os.getenv("WEAVIATE_URL", "https://org-bgpt4w63nvf1yeuw.c0.us-west3.gcp.weaviate.cloud")
36
- # تأكد من وضع مفتاح API الخاص بك هنا
37
- WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY", "ZUd6clB5WmYzVGkxeU40cl96NTY5UkViUlVzY05Md3IzQ0JKelBZQmxGZHRPeGpCeGdxS1FUNnlYUkFFPV92MjAw")
38
-
39
- try:
40
- weaviate_client = weaviate.Client(
41
- url=WEAVIATE_URL,
42
- auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY),
43
- )
44
- print("[INFO] Successfully connected to Weaviate.")
45
- except Exception as e:
46
- print(f"[ERROR] Failed to connect to Weaviate: {e}")
47
- weaviate_client = None
48
-
49
- # --- Language & Routing Configuration ---
50
- ARABIC_RANGE = (
51
- (0x0600, 0x06FF), (0x0750, 0x077F), (0x08A0, 0x08FF),
52
- (0xFB50, 0xFDFF), (0xFE70, 0xFEFF), (0x1EE00, 0x1EEFF)
53
  )
54
 
55
- @dataclass
56
- class Route:
57
- audience: str
58
- intent: str
59
- language: str
60
-
61
- # --- Knowledge Base (KB) for simple queries ---
62
  KB: Dict[str, Dict[str, str]] = {
63
  "student_registration": {
64
- "en": "**How to register (Student)**\n\n1. Go to the EduNatives site and choose Sign Up.\n2. Use your university email and verify it.\n3. Complete your profile.",
65
- "ar": "**طريقة التسجيل (طلاب)**\n\n١. اذهب إلى موقع EduNatives واختر Sign Up.\n٢. استخدم إيميل الجامعة وأكده.\n٣. أكمل ملفك الشخصي.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  },
67
  }
68
 
69
- # --- UPDATED: Keywords for intent routing, including RAG intents ---
70
  KEYS = {
71
- # RAG Intents
72
- "find_job": ["job", "jobs", "career", "hiring", "وظيفة", "وظائف", "توظيف", "شغل"],
73
- "find_opportunity": ["intern", "internship", "scholarship", "opportunity", "training", "تدريب", "منحة", "فرصة"],
74
- "find_project": ["project", "projects", "research", "مشروع", "مشاريع", "بحث", "ابحاث"],
75
- "join_team": ["team", "join team", "find team", "فريق", "انضم لفريق", "تيم"],
76
-
77
- # Standard Intents
78
- "student_registration": ["register", "sign up", "account", "تسجيل", "حساب"],
79
- "student_mentors": ["mentor", "advisor", "professor", "مشرف", "دكتور"],
80
- "university_publish": ["publish", "paper", "conference", "نشر", "مؤتمر"],
81
  }
82
 
83
- # --- Mapping intents to audiences ---
84
  AUDIENCE_MAP = {
85
- "find_job": "student",
86
- "find_opportunity": "student",
87
- "find_project": "student",
88
- "join_team": "student",
89
  "student_registration": "student",
 
90
  "student_mentors": "student",
91
  "university_publish": "university",
 
 
 
 
 
 
92
  }
93
 
94
- # --- System Prompts ---
95
  SYSTEM_PROMPT_BASE = (
96
- "You are **EduNatives Assistant**, a helpful, friendly, and precise academic/career guide for Students, Universities, and Companies. "
97
- "Reply in the user's language (Arabic/English). Be concise and action-oriented."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  )
99
 
100
- RAG_PROMPT_TEMPLATE = (
101
- "Based on the following information retrieved from our database, please answer the user's question. "
102
- "Format the results clearly (e.g., using a list or table). At the end, ask the user if they need help applying or have more questions.\n\n"
103
- "--- RETRIEVED DATA ---\n{retrieved_data}\n--- END DATA ---\n\n"
104
  )
105
 
106
- # --- 2. CORE FUNCTIONS ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- def log_interaction(data: Dict[str, Any]):
109
- """Appends interaction data to a JSONL file for analytics."""
110
- with open("interaction_log.jsonl", "a", encoding="utf-8") as f:
111
- f.write(json.dumps(data, ensure_ascii=False) + "\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- def process_uploaded_file(file_obj: Any) -> str | None:
114
- """Extracts text from an uploaded file object."""
115
- if file_obj is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  return None
117
  file_path = file_obj.name
118
- file_name = os.path.basename(file_path)
119
  text_content = ""
120
  try:
121
- if file_name.lower().endswith(".pdf"):
122
  with fitz.open(file_path) as doc:
123
- text_content = "".join(page.get_text() for page in doc)
124
- elif file_name.lower().endswith(".docx"):
 
125
  doc = docx.Document(file_path)
126
- text_content = "\n".join(para.text for para in doc.paragraphs)
127
- elif file_name.lower().endswith(".txt"):
 
128
  with open(file_path, "r", encoding="utf-8") as f:
129
  text_content = f.read()
130
  else:
131
- return f"[Unsupported file type: {file_name}]"
132
- return text_content.strip()
 
133
  except Exception as e:
134
- print(f"[ERROR] Failed to process file {file_name}: {e}")
135
- return f"[Error processing file: {file_name}]"
136
 
137
- def is_arabic(text: str) -> bool:
138
- """Checks if a string contains Arabic characters."""
139
- return any(any(a <= ord(ch) <= b for a, b in ARABIC_RANGE) for ch in text)
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- def route_intent(text: str, has_file: bool, forced_audience: str | None = None) -> Route:
142
- """Determines user intent based on keywords and context."""
143
- lang = "ar" if is_arabic(text) else "en"
144
- text_l = text.lower()
145
-
146
- # Special intent for CV analysis
147
- if has_file and any(kw in text_l for kw in ["cv", "resume", "my skills", "سيرة ذاتية", "ملفي"]):
148
- return Route(audience="student", intent="analyze_cv_for_opportunities", language=lang)
149
 
150
- # Keyword-based routing
 
 
151
  match_label = None
152
  for label, kws in KEYS.items():
153
- if any(kw in text_l for kw in kws):
154
- match_label = label
 
 
 
155
  break
156
-
157
- if match_label:
158
- audience = AUDIENCE_MAP.get(match_label, "general")
159
- if forced_audience:
160
- audience = forced_audience
161
- return Route(audience=audience, intent=match_label, language=lang)
162
-
163
- # Fallback to general intent
164
- return Route(audience=forced_audience or "general", intent="general", language=lang)
165
-
166
- def call_llm(user_message: str, history: List[Dict[str, str]], system_prompt: str) -> str:
167
- """Generic function to call the LLM."""
168
- messages: List[Dict[str, str]] = [{"role": "system", "content": system_prompt}]
169
- messages.extend(history[-6:]) # Keep last 3 turns
 
170
  messages.append({"role": "user", "content": user_message})
171
-
172
  try:
173
  resp = llm_client.chat.completions.create(
174
- model=MODEL_NAME, messages=messages, temperature=0.6, top_p=0.9, max_tokens=4096,
 
 
 
175
  )
176
  return resp.choices[0].message.content or ""
177
  except Exception as e:
178
- print(f"[ERROR] LLM call failed: {e}")
179
- return "Sorry, I'm having trouble connecting to my brain right now. Please try again later."
180
-
181
- def query_weaviate(class_name: str, query_text: str, properties: List[str], limit: int = 5) -> str:
182
- """Performs vector search on a Weaviate collection and formats the output."""
183
- if not weaviate_client:
184
- return "Database connection is not available."
185
- try:
186
- response = (
187
- weaviate_client.query
188
- .get(class_name, properties)
189
- .with_near_text({"concepts": [query_text]})
190
- .with_limit(limit)
191
- .do()
192
- )
193
-
194
- results = response["data"]["Get"][class_name]
195
- if not results:
196
- return f"No matching {class_name.lower()} found."
197
-
198
- formatted_output = ""
199
- for i, item in enumerate(results):
200
- formatted_output += f"### Result {i+1}\n"
201
- for prop in properties:
202
- if prop in item and item[prop]:
203
- formatted_output += f"- **{prop.replace('_', ' ').title()}**: {item[prop]}\n"
204
- formatted_output += "\n"
205
- return formatted_output.strip()
206
-
207
- except Exception as e:
208
- print(f"[ERROR] Weaviate query failed for class '{class_name}': {e}")
209
- return f"An error occurred while searching for {class_name.lower()}."
210
-
211
- def analyze_cv_with_llm(cv_text: str) -> str:
212
- """Uses LLM to extract key skills and information from a CV."""
213
- prompt = (
214
- "Analyze the following CV text and extract the key information. "
215
- "Summarize it into a short phrase suitable for a vector search to find matching jobs or internships. "
216
- "Focus on technical skills, programming languages, field of study, and key experiences.\n\n"
217
- f"--- CV TEXT ---\n{cv_text}\n--- END CV TEXT ---"
218
- )
219
- return call_llm(prompt, [], SYSTEM_PROMPT_BASE)
220
-
221
- # --- 3. GRADIO UI & EVENT HANDLERS ---
222
 
 
223
  with gr.Blocks(css="""
224
  .chatbot {height: 500px; overflow: auto;}
225
  .user-bubble {background-color: #DCF8C6; padding: 10px; border-radius: 12px; max-width: 75%; float: right; clear: both; margin: 5px; word-wrap: break-word;}
@@ -229,116 +753,222 @@ with gr.Blocks(css="""
229
  .bot-bubble th, .bot-bubble td {border: 1px solid #ddd; padding: 8px; text-align: left;}
230
  .bot-bubble th {background-color: #e9e9e9;}
231
  """) as demo:
232
- gr.Markdown("# 🤖 EduNatives Assistant\nYour smart, bilingual guide for academic and career opportunities.")
 
233
 
234
  with gr.Row():
235
- audience_dd = gr.Dropdown(
236
- label="Audience",
237
- choices=["Auto", "Student", "University-Research", "Company"],
238
- value="Auto",
239
- interactive=True,
240
- info="Select your role. 'Auto' detects it from your message."
241
- )
242
  clear_btn = gr.Button("🧹 Clear Chat")
243
 
244
  status = gr.Markdown("Status: Ready.")
245
  chatbot_html = gr.HTML("<div class='chatbot' id='chatbot'></div>")
246
  chat_history_state = gr.State([])
 
247
 
248
  with gr.Row(elem_classes="chatbox-container"):
249
- msg = gr.Textbox(
250
- placeholder="اكتب سؤالك هنا... / Ask your question here...",
251
- lines=2, scale=4, autofocus=True,
252
- )
253
- file_uploader = gr.File(
254
- label="Upload Document (.txt, .pdf, .docx)",
255
- file_types=[".txt", ".pdf", ".docx"],
256
- interactive=True,
257
- )
258
  with gr.Column(scale=1, min_width=120):
259
  send_btn = gr.Button("➡️ Send", scale=1, variant="primary")
260
 
261
- def format_chat_html(history: List[Dict[str, str]]) -> str:
262
- """Converts chat history to styled HTML."""
263
- html = "<div class='chatbot'>"
264
- for message in history:
265
- role, content = message["role"], message["content"]
266
- bubble_class = "user-bubble" if role == "user" else "bot-bubble"
267
- html_content = markdown.markdown(content, extensions=['tables']) if role == "assistant" else content
268
- html += f"<div class='{bubble_class}'>{html_content}</div>"
269
- html += "</div>"
270
- return html
271
-
272
- def respond(user_text: str, file_obj: Any, history: List[Dict[str, str]], audience_choice: str):
273
- # 1. Process inputs
274
- document_text = process_uploaded_file(file_obj)
275
- if not user_text.strip() and not document_text:
276
- return "", format_chat_html(history), history, "Status: Please type a message or upload a file.", None
277
 
278
- user_message_for_history = user_text
279
- if document_text:
280
- file_name = os.path.basename(file_obj.name)
281
- user_message_for_history += f"\n\n*📎 [File Attached: {file_name}]*"
282
-
283
- # 2. Route intent
284
  forced = {"Student": "student", "University-Research": "university", "Company": "company"}.get(audience_choice)
285
- route = route_intent(user_text, has_file=bool(document_text), forced_audience=forced)
286
  status_text = f"**Audience**: {route.audience} | **Intent**: {route.intent} | **Lang**: {route.language.upper()}"
287
-
288
- # 3. Handle different intents
289
- answer = ""
290
- rag_query_text = user_text
291
-
292
- # --- RAG Logic Branch ---
293
- if weaviate_client and route.intent in ["find_job", "find_opportunity", "find_project", "analyze_cv_for_opportunities"]:
294
- if route.intent == "analyze_cv_for_opportunities":
295
- status_text += " | Analyzing CV..."
296
- # Use LLM to get a search query from the CV
297
- rag_query_text = analyze_cv_with_llm(document_text)
298
-
299
- if "job" in route.intent or "cv" in route.intent:
300
- retrieved_data = query_weaviate("Job", rag_query_text, ["title", "company", "location", "description"])
301
- elif "opportunity" in route.intent:
302
- retrieved_data = query_weaviate("Opportunities", rag_query_text, ["title", "type", "organization", "summary"])
303
- elif "project" in route.intent:
304
- retrieved_data = query_weaviate("Project", rag_query_text, ["title", "field", "university", "abstract"])
305
-
306
- # Combine retrieved data with LLM for a natural response
307
- system_prompt = RAG_PROMPT_TEMPLATE.format(retrieved_data=retrieved_data)
308
- answer = call_llm(user_text, history, system_prompt)
309
 
310
- # --- KB/General LLM Logic Branch ---
311
- else:
312
- if route.intent in KB:
313
- answer = KB[route.intent].get(route.language, KB[route.intent]["en"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  else:
315
- answer = call_llm(user_text, history, SYSTEM_PROMPT_BASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
- # 4. Update history, log, and format for UI
318
  history.append({"role": "user", "content": user_message_for_history})
319
- history.append({"role": "assistant", "content": answer})
320
-
321
- log_interaction({
322
- "timestamp": time.time(),
323
- "user_message": user_text,
324
- "file_uploaded": file_obj.name if file_obj else None,
325
- "audience": route.audience,
326
- "intent": route.intent,
327
- "language": route.language,
328
- "bot_response": answer
329
- })
330
-
331
- updated_html = format_chat_html(history)
332
- return "", updated_html, history, status_text, None # Clear text input and file uploader
333
 
334
- def clear_chat():
335
- """Clears the chat history and UI components."""
336
- return "", [], "Status: Ready.", None
 
 
337
 
338
- # Event Handlers
339
- send_btn.click(respond, inputs=[msg, file_uploader, chat_history_state, audience_dd], outputs=[msg, chatbot_html, chat_history_state, status, file_uploader], queue=True)
340
- msg.submit(respond, inputs=[msg, file_uploader, chat_history_state, audience_dd], outputs=[msg, chatbot_html, chat_history_state, status, file_uploader], queue=True)
341
- clear_btn.click(clear_chat, outputs=[chatbot_html, chat_history_state, status, file_uploader], queue=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
  if __name__ == "__main__":
344
  demo.launch(debug=True)
 
1
+ # from __future__ import annotations
2
+ # import os
3
+ # import json
4
+ # import time
5
+ # import uuid
6
+ # from dataclasses import dataclass
7
+ # from typing import List, Dict, Any
8
+
9
+ # # --- Gradio & UI ---
10
+ # import markdown
11
+ # import gradio as gr
12
+
13
+ # # --- LLM & Vector DB ---
14
+ # from openai import OpenAI
15
+ # import weaviate
16
+
17
+ # # --- File Processing ---
18
+ # import fitz # PyMuPDF
19
+ # import docx
20
+
21
+ # # --- 1. BACKEND LOGIC & CONFIG ---
22
+
23
+ # # --- LLM Configuration (DeepInfra) ---
24
+ # MODEL_NAME = "openai/gpt-oss-120b"
25
+ # # تأكد من وضع مفتاح API الخاص بك هنا
26
+ # DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY", "kPEm10rrnxXrCf0TuB6Xcd7Y7lp3YgKa")
27
+ # BASE_URL = "https://api.deepinfra.com/v1/openai"
28
+
29
+ # if not DEEPINFRA_API_KEY:
30
+ # print("[WARN] DEEPINFRA_API_KEY is not set. The chatbot will likely fail.")
31
+
32
+ # llm_client = OpenAI(api_key=DEEPINFRA_API_KEY, base_url=BASE_URL)
33
+
34
+ # # --- Weaviate RAG Configuration ---
35
+ # WEAVIATE_URL = os.getenv("WEAVIATE_URL", "https://org-bgpt4w63nvf1yeuw.c0.us-west3.gcp.weaviate.cloud")
36
+ # # تأكد من وضع مفتاح API الخاص بك هنا
37
+ # WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY", "ZUd6clB5WmYzVGkxeU40cl96NTY5UkViUlVzY05Md3IzQ0JKelBZQmxGZHRPeGpCeGdxS1FUNnlYUkFFPV92MjAw")
38
+
39
+ # try:
40
+ # weaviate_client = weaviate.Client(
41
+ # url=WEAVIATE_URL,
42
+ # auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY),
43
+ # )
44
+ # print("[INFO] Successfully connected to Weaviate.")
45
+ # except Exception as e:
46
+ # print(f"[ERROR] Failed to connect to Weaviate: {e}")
47
+ # weaviate_client = None
48
+
49
+ # # --- Language & Routing Configuration ---
50
+ # ARABIC_RANGE = (
51
+ # (0x0600, 0x06FF), (0x0750, 0x077F), (0x08A0, 0x08FF),
52
+ # (0xFB50, 0xFDFF), (0xFE70, 0xFEFF), (0x1EE00, 0x1EEFF)
53
+ # )
54
+
55
+ # @dataclass
56
+ # class Route:
57
+ # audience: str
58
+ # intent: str
59
+ # language: str
60
+
61
+ # # --- Knowledge Base (KB) for simple queries ---
62
+ # KB: Dict[str, Dict[str, str]] = {
63
+ # "student_registration": {
64
+ # "en": "**How to register (Student)**\n\n1. Go to the EduNatives site and choose Sign Up.\n2. Use your university email and verify it.\n3. Complete your profile.",
65
+ # "ar": "**طريقة التسجيل (طلاب)**\n\n١. اذهب إلى موقع EduNatives واختر Sign Up.\n٢. استخدم إيميل الجامعة وأكده.\n٣. أكمل ملفك الشخصي.",
66
+ # },
67
+ # }
68
+
69
+ # # --- UPDATED: Keywords for intent routing, including RAG intents ---
70
+ # KEYS = {
71
+ # # RAG Intents
72
+ # "find_job": ["job", "jobs", "career", "hiring", "وظيفة", "وظائف", "توظيف", "شغل"],
73
+ # "find_opportunity": ["intern", "internship", "scholarship", "opportunity", "training", "تدريب", "منحة", "فرصة"],
74
+ # "find_project": ["project", "projects", "research", "مشروع", "مشاريع", "بحث", "ابحاث"],
75
+ # "join_team": ["team", "join team", "find team", "فريق", "انضم لفريق", "تيم"],
76
+
77
+ # # Standard Intents
78
+ # "student_registration": ["register", "sign up", "account", "تسجيل", "حساب"],
79
+ # "student_mentors": ["mentor", "advisor", "professor", "مشرف", "دكتور"],
80
+ # "university_publish": ["publish", "paper", "conference", "نشر", "مؤتمر"],
81
+ # }
82
+
83
+ # # --- Mapping intents to audiences ---
84
+ # AUDIENCE_MAP = {
85
+ # "find_job": "student",
86
+ # "find_opportunity": "student",
87
+ # "find_project": "student",
88
+ # "join_team": "student",
89
+ # "student_registration": "student",
90
+ # "student_mentors": "student",
91
+ # "university_publish": "university",
92
+ # }
93
+
94
+ # # --- System Prompts ---
95
+ # SYSTEM_PROMPT_BASE = (
96
+ # "You are **EduNatives Assistant**, a helpful, friendly, and precise academic/career guide for Students, Universities, and Companies. "
97
+ # "Reply in the user's language (Arabic/English). Be concise and action-oriented."
98
+ # )
99
+
100
+ # RAG_PROMPT_TEMPLATE = (
101
+ # "Based on the following information retrieved from our database, please answer the user's question. "
102
+ # "Format the results clearly (e.g., using a list or table). At the end, ask the user if they need help applying or have more questions.\n\n"
103
+ # "--- RETRIEVED DATA ---\n{retrieved_data}\n--- END DATA ---\n\n"
104
+ # )
105
+
106
+ # # --- 2. CORE FUNCTIONS ---
107
+
108
+ # def log_interaction(data: Dict[str, Any]):
109
+ # """Appends interaction data to a JSONL file for analytics."""
110
+ # with open("interaction_log.jsonl", "a", encoding="utf-8") as f:
111
+ # f.write(json.dumps(data, ensure_ascii=False) + "\n")
112
+
113
+ # def process_uploaded_file(file_obj: Any) -> str | None:
114
+ # """Extracts text from an uploaded file object."""
115
+ # if file_obj is None:
116
+ # return None
117
+ # file_path = file_obj.name
118
+ # file_name = os.path.basename(file_path)
119
+ # text_content = ""
120
+ # try:
121
+ # if file_name.lower().endswith(".pdf"):
122
+ # with fitz.open(file_path) as doc:
123
+ # text_content = "".join(page.get_text() for page in doc)
124
+ # elif file_name.lower().endswith(".docx"):
125
+ # doc = docx.Document(file_path)
126
+ # text_content = "\n".join(para.text for para in doc.paragraphs)
127
+ # elif file_name.lower().endswith(".txt"):
128
+ # with open(file_path, "r", encoding="utf-8") as f:
129
+ # text_content = f.read()
130
+ # else:
131
+ # return f"[Unsupported file type: {file_name}]"
132
+ # return text_content.strip()
133
+ # except Exception as e:
134
+ # print(f"[ERROR] Failed to process file {file_name}: {e}")
135
+ # return f"[Error processing file: {file_name}]"
136
+
137
+ # def is_arabic(text: str) -> bool:
138
+ # """Checks if a string contains Arabic characters."""
139
+ # return any(any(a <= ord(ch) <= b for a, b in ARABIC_RANGE) for ch in text)
140
+
141
+ # def route_intent(text: str, has_file: bool, forced_audience: str | None = None) -> Route:
142
+ # """Determines user intent based on keywords and context."""
143
+ # lang = "ar" if is_arabic(text) else "en"
144
+ # text_l = text.lower()
145
+
146
+ # # Special intent for CV analysis
147
+ # if has_file and any(kw in text_l for kw in ["cv", "resume", "my skills", "سيرة ذاتية", "ملفي"]):
148
+ # return Route(audience="student", intent="analyze_cv_for_opportunities", language=lang)
149
+
150
+ # # Keyword-based routing
151
+ # match_label = None
152
+ # for label, kws in KEYS.items():
153
+ # if any(kw in text_l for kw in kws):
154
+ # match_label = label
155
+ # break
156
+
157
+ # if match_label:
158
+ # audience = AUDIENCE_MAP.get(match_label, "general")
159
+ # if forced_audience:
160
+ # audience = forced_audience
161
+ # return Route(audience=audience, intent=match_label, language=lang)
162
+
163
+ # # Fallback to general intent
164
+ # return Route(audience=forced_audience or "general", intent="general", language=lang)
165
+
166
+ # def call_llm(user_message: str, history: List[Dict[str, str]], system_prompt: str) -> str:
167
+ # """Generic function to call the LLM."""
168
+ # messages: List[Dict[str, str]] = [{"role": "system", "content": system_prompt}]
169
+ # messages.extend(history[-6:]) # Keep last 3 turns
170
+ # messages.append({"role": "user", "content": user_message})
171
+
172
+ # try:
173
+ # resp = llm_client.chat.completions.create(
174
+ # model=MODEL_NAME, messages=messages, temperature=0.6, top_p=0.9, max_tokens=4096,
175
+ # )
176
+ # return resp.choices[0].message.content or ""
177
+ # except Exception as e:
178
+ # print(f"[ERROR] LLM call failed: {e}")
179
+ # return "Sorry, I'm having trouble connecting to my brain right now. Please try again later."
180
+
181
+ # def query_weaviate(class_name: str, query_text: str, properties: List[str], limit: int = 5) -> str:
182
+ # """Performs vector search on a Weaviate collection and formats the output."""
183
+ # if not weaviate_client:
184
+ # return "Database connection is not available."
185
+ # try:
186
+ # response = (
187
+ # weaviate_client.query
188
+ # .get(class_name, properties)
189
+ # .with_near_text({"concepts": [query_text]})
190
+ # .with_limit(limit)
191
+ # .do()
192
+ # )
193
+
194
+ # results = response["data"]["Get"][class_name]
195
+ # if not results:
196
+ # return f"No matching {class_name.lower()} found."
197
+
198
+ # formatted_output = ""
199
+ # for i, item in enumerate(results):
200
+ # formatted_output += f"### Result {i+1}\n"
201
+ # for prop in properties:
202
+ # if prop in item and item[prop]:
203
+ # formatted_output += f"- **{prop.replace('_', ' ').title()}**: {item[prop]}\n"
204
+ # formatted_output += "\n"
205
+ # return formatted_output.strip()
206
+
207
+ # except Exception as e:
208
+ # print(f"[ERROR] Weaviate query failed for class '{class_name}': {e}")
209
+ # return f"An error occurred while searching for {class_name.lower()}."
210
+
211
+ # def analyze_cv_with_llm(cv_text: str) -> str:
212
+ # """Uses LLM to extract key skills and information from a CV."""
213
+ # prompt = (
214
+ # "Analyze the following CV text and extract the key information. "
215
+ # "Summarize it into a short phrase suitable for a vector search to find matching jobs or internships. "
216
+ # "Focus on technical skills, programming languages, field of study, and key experiences.\n\n"
217
+ # f"--- CV TEXT ---\n{cv_text}\n--- END CV TEXT ---"
218
+ # )
219
+ # return call_llm(prompt, [], SYSTEM_PROMPT_BASE)
220
+
221
+ # # --- 3. GRADIO UI & EVENT HANDLERS ---
222
+
223
+ # with gr.Blocks(css="""
224
+ # .chatbot {height: 500px; overflow: auto;}
225
+ # .user-bubble {background-color: #DCF8C6; padding: 10px; border-radius: 12px; max-width: 75%; float: right; clear: both; margin: 5px; word-wrap: break-word;}
226
+ # .bot-bubble {background-color: #F1F0F0; padding: 10px; border-radius: 12px; max-width: 75%; float: left; clear: both; margin: 5px; word-wrap: break-word;}
227
+ # .chatbox-container {display: flex; gap: 8px; margin-top: 10px;}
228
+ # .bot-bubble table {border-collapse: collapse; width: 100%;}
229
+ # .bot-bubble th, .bot-bubble td {border: 1px solid #ddd; padding: 8px; text-align: left;}
230
+ # .bot-bubble th {background-color: #e9e9e9;}
231
+ # """) as demo:
232
+ # gr.Markdown("# 🤖 EduNatives Assistant\nYour smart, bilingual guide for academic and career opportunities.")
233
+
234
+ # with gr.Row():
235
+ # audience_dd = gr.Dropdown(
236
+ # label="Audience",
237
+ # choices=["Auto", "Student", "University-Research", "Company"],
238
+ # value="Auto",
239
+ # interactive=True,
240
+ # info="Select your role. 'Auto' detects it from your message."
241
+ # )
242
+ # clear_btn = gr.Button("🧹 Clear Chat")
243
+
244
+ # status = gr.Markdown("Status: Ready.")
245
+ # chatbot_html = gr.HTML("<div class='chatbot' id='chatbot'></div>")
246
+ # chat_history_state = gr.State([])
247
+
248
+ # with gr.Row(elem_classes="chatbox-container"):
249
+ # msg = gr.Textbox(
250
+ # placeholder="اكتب سؤالك هنا... / Ask your question here...",
251
+ # lines=2, scale=4, autofocus=True,
252
+ # )
253
+ # file_uploader = gr.File(
254
+ # label="Upload Document (.txt, .pdf, .docx)",
255
+ # file_types=[".txt", ".pdf", ".docx"],
256
+ # interactive=True,
257
+ # )
258
+ # with gr.Column(scale=1, min_width=120):
259
+ # send_btn = gr.Button("➡️ Send", scale=1, variant="primary")
260
+
261
+ # def format_chat_html(history: List[Dict[str, str]]) -> str:
262
+ # """Converts chat history to styled HTML."""
263
+ # html = "<div class='chatbot'>"
264
+ # for message in history:
265
+ # role, content = message["role"], message["content"]
266
+ # bubble_class = "user-bubble" if role == "user" else "bot-bubble"
267
+ # html_content = markdown.markdown(content, extensions=['tables']) if role == "assistant" else content
268
+ # html += f"<div class='{bubble_class}'>{html_content}</div>"
269
+ # html += "</div>"
270
+ # return html
271
+
272
+ # def respond(user_text: str, file_obj: Any, history: List[Dict[str, str]], audience_choice: str):
273
+ # # 1. Process inputs
274
+ # document_text = process_uploaded_file(file_obj)
275
+ # if not user_text.strip() and not document_text:
276
+ # return "", format_chat_html(history), history, "Status: Please type a message or upload a file.", None
277
+
278
+ # user_message_for_history = user_text
279
+ # if document_text:
280
+ # file_name = os.path.basename(file_obj.name)
281
+ # user_message_for_history += f"\n\n*📎 [File Attached: {file_name}]*"
282
+
283
+ # # 2. Route intent
284
+ # forced = {"Student": "student", "University-Research": "university", "Company": "company"}.get(audience_choice)
285
+ # route = route_intent(user_text, has_file=bool(document_text), forced_audience=forced)
286
+ # status_text = f"**Audience**: {route.audience} | **Intent**: {route.intent} | **Lang**: {route.language.upper()}"
287
+
288
+ # # 3. Handle different intents
289
+ # answer = ""
290
+ # rag_query_text = user_text
291
+
292
+ # # --- RAG Logic Branch ---
293
+ # if weaviate_client and route.intent in ["find_job", "find_opportunity", "find_project", "analyze_cv_for_opportunities"]:
294
+ # if route.intent == "analyze_cv_for_opportunities":
295
+ # status_text += " | Analyzing CV..."
296
+ # # Use LLM to get a search query from the CV
297
+ # rag_query_text = analyze_cv_with_llm(document_text)
298
+
299
+ # if "job" in route.intent or "cv" in route.intent:
300
+ # retrieved_data = query_weaviate("Job", rag_query_text, ["title", "company", "location", "description"])
301
+ # elif "opportunity" in route.intent:
302
+ # retrieved_data = query_weaviate("Opportunities", rag_query_text, ["title", "type", "organization", "summary"])
303
+ # elif "project" in route.intent:
304
+ # retrieved_data = query_weaviate("Project", rag_query_text, ["title", "field", "university", "abstract"])
305
+
306
+ # # Combine retrieved data with LLM for a natural response
307
+ # system_prompt = RAG_PROMPT_TEMPLATE.format(retrieved_data=retrieved_data)
308
+ # answer = call_llm(user_text, history, system_prompt)
309
+
310
+ # # --- KB/General LLM Logic Branch ---
311
+ # else:
312
+ # if route.intent in KB:
313
+ # answer = KB[route.intent].get(route.language, KB[route.intent]["en"])
314
+ # else:
315
+ # answer = call_llm(user_text, history, SYSTEM_PROMPT_BASE)
316
+
317
+ # # 4. Update history, log, and format for UI
318
+ # history.append({"role": "user", "content": user_message_for_history})
319
+ # history.append({"role": "assistant", "content": answer})
320
+
321
+ # log_interaction({
322
+ # "timestamp": time.time(),
323
+ # "user_message": user_text,
324
+ # "file_uploaded": file_obj.name if file_obj else None,
325
+ # "audience": route.audience,
326
+ # "intent": route.intent,
327
+ # "language": route.language,
328
+ # "bot_response": answer
329
+ # })
330
+
331
+ # updated_html = format_chat_html(history)
332
+ # return "", updated_html, history, status_text, None # Clear text input and file uploader
333
+
334
+ # def clear_chat():
335
+ # """Clears the chat history and UI components."""
336
+ # return "", [], "Status: Ready.", None
337
+
338
+ # # Event Handlers
339
+ # send_btn.click(respond, inputs=[msg, file_uploader, chat_history_state, audience_dd], outputs=[msg, chatbot_html, chat_history_state, status, file_uploader], queue=True)
340
+ # msg.submit(respond, inputs=[msg, file_uploader, chat_history_state, audience_dd], outputs=[msg, chatbot_html, chat_history_state, status, file_uploader], queue=True)
341
+ # clear_btn.click(clear_chat, outputs=[chatbot_html, chat_history_state, status, file_uploader], queue=False)
342
+
343
+ # if __name__ == "__main__":
344
+ # demo.launch(debug=True)
345
+
346
+
347
+
348
+
349
+ # app.py -- Full EduNatives chatbot with RAG + Application + Team flows
350
  from __future__ import annotations
351
  import os
352
  import json
353
  import time
354
+ import re
355
  import uuid
356
+ import datetime
357
  from dataclasses import dataclass
358
+ from typing import List, Dict, Any, Optional
359
 
 
360
  import markdown
361
  import gradio as gr
 
 
362
  from openai import OpenAI
 
 
 
363
  import fitz # PyMuPDF
364
  import docx
365
+ import weaviate
366
+ from weaviate.classes.init import Auth
367
+ from weaviate.classes.config import Configure, Property, DataType
368
 
369
+ # -------------------- Configuration --------------------
370
# All settings are read from the environment. API keys deliberately have NO
# fallback value: credentials must never be committed to source control.
# Any key that was previously hard-coded here should be treated as leaked
# and rotated with the provider.
MODEL_NAME = os.getenv("MODEL_NAME", "openai/gpt-oss-120b")
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY", "")
BASE_URL = os.getenv("BASE_URL", "https://api.deepinfra.com/v1/openai")

# NOTE(review): this default host looks truncated/garbled (no scheme, odd
# "htorg" prefix) -- confirm the real cluster URL and set WEAVIATE_URL explicitly.
WEAVIATE_URL = os.getenv("WEAVIATE_URL", "htorgbgpt4w63nvf1yeuw.c0.us-west3.gcp.weaviate.cloud")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY", "")

MEMORY_FILE = os.getenv("MEMORY_FILE", "chat_memory.json")
LOG_FILE = os.getenv("LOG_FILE", "chat_analytics.json")

# Warn early (instead of failing later with an opaque auth error) when a
# required secret is missing from the environment.
for _secret_name, _secret_value in (
    ("DEEPINFRA_API_KEY", DEEPINFRA_API_KEY),
    ("WEAVIATE_API_KEY", WEAVIATE_API_KEY),
):
    if not _secret_value:
        print(f"[WARN] {_secret_name} is not set. Requests that need it will fail.")
379
 
380
+ # -------------------- Clients --------------------
381
# LLM client: OpenAI SDK pointed at BASE_URL and authenticated with
# DEEPINFRA_API_KEY. Shared by every completion call in this module.
llm_client = OpenAI(api_key=DEEPINFRA_API_KEY, base_url=BASE_URL)

# Weaviate client: created as a module-level statement, so the connection is
# attempted as soon as this file is imported/run.
# NOTE(review): the client is never explicitly closed in the visible code --
# confirm whether a shutdown hook is needed.
weaviate_client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)
389
 
390
+ # -------------------- KB, Keys, prompts --------------------
 
 
 
 
 
 
391
  KB: Dict[str, Dict[str, str]] = {
392
  "student_registration": {
393
+ "en": (
394
+ "**How to register / create an account (Student)**\n\n"
395
+ "1. Go to the EduNatives site and choose Sign Up.\n"
396
+ "2. Use your university email if possible and verify it.\n"
397
+ "3. Complete your profile (major, skills, interests).\n"
398
+ "4. Enable notifications for internships/scholarships."
399
+ ),
400
+ "ar": (
401
+ "**طريقة التسجيل وإنشاء حساب (طلاب)**\n\n"
402
+ "١. اذهب إلى موقع EduNatives واختر Sign Up.\n"
403
+ "٢. يفضل استخدام إيميل الجامعة وتأكيده.\n"
404
+ "٣. أكمل ملفك الشخصي (التخصص، المهارات، الاهتمامات).\n"
405
+ "٤. فعّل التنبيهات لفرص التدريب والمنح."
406
+ ),
407
+ },
408
+ "student_internships": {
409
+ "en": (
410
+ "**Finding internships & scholarships**\n\n"
411
+ "- Use the search filters: field, location, duration, paid/unpaid.\n"
412
+ "- Follow companies and set up alerts for new opportunities.\n"
413
+ "- Keep your profile and resume updated."
414
+ ),
415
+ "ar": (
416
+ "**كيفية العثور على تدريب أو منحة**\n\n"
417
+ "- استخدم فلاتر البحث: التخصص، المكان، المدة، مدفوع/غير مدفوع.\n"
418
+ "- تابع الشركات وفعّل التنبيهات للفرص الجديدة.\n"
419
+ "- حافظ على تحديث ملفك الشخصي وسيرتك الذاتية."
420
+ ),
421
  },
422
  }
423
 
 
424
  KEYS = {
425
+ "student_registration": ["register", "sign up", "signup", "create account", "account", "تسجيل", "انشاء", "إنشاء", "حساب", "اعمل حساب", "سجل"],
426
+ "student_internships": ["intern", "internship", "training", "scholar", "scholarship", "grant", "opportunity", "تدريب", "تدريبي", "منحة", "منح", "فرصة", "فرص", "انترنشيب"],
427
+ "student_mentors": ["mentor", "advisor", "professor", "supervisor", "faculty", "connect", "منتور", "مشرف", "دكتور", "أستاذ", "استاذ", "التواصل", "اكلم"],
428
+ "university_publish": ["publish", "paper", "research", "preprint", "conference", "event", "seminar", "webinar", "نشر", "أبحاث", "ابحاث", "بحث", "مؤتمر", "فعالية", "فعاليات", "ندوة", "ورشة"],
429
+ "university_connect": ["students", "connect with students", "reach students", "collaborate", "طلاب", "تواصل مع الطلاب", "التواصل مع الطلاب", "تعاون"],
430
+ "company_post_jobs": ["job", "jobs", "post job", "hiring", "hire", "internships", "graduate", "وظيفة", "وظائف", "اعلان", "إعلان", "نشر وظيفة", "توظيف", "فرص تدريب", "خريجين"],
431
+ "company_find_talent": ["talent", "candidate", "recruit", "search", "find", "pipeline", "موهبة", "مواهب", "مرشحين", "تعيين", "تجنيد", "ابحث", "دور على"],
432
+ "project_query": ["project", "projects", "مشروع", "مشاريع", "هدف", "أهداف"],
433
+ "apply_job_opportunity": ["apply", "application", "تقديم", "طلب", "عايز اقدم", "اريد التقديم", "اريد اتقدم"],
434
+ "join_team": ["team", "join team", "فريق", "انضمام لفريق", "انضمام", "انضم"],
435
  }
436
 
 
437
  AUDIENCE_MAP = {
 
 
 
 
438
  "student_registration": "student",
439
+ "student_internships": "student",
440
  "student_mentors": "student",
441
  "university_publish": "university",
442
+ "university_connect": "university",
443
+ "company_post_jobs": "company",
444
+ "company_find_talent": "company",
445
+ "project_query": "student",
446
+ "apply_job_opportunity": "student",
447
+ "join_team": "student",
448
  }
449
 
 
450
  SYSTEM_PROMPT_BASE = (
451
+ "You are **EduNatives Assistant**, a helpful, friendly, and precise academic/career guide. "
452
+ "You serve three primary audiences: Students, Universities/Researchers, and Companies.\n\n"
453
+ "Goals by audience:\n"
454
+ "- Students: registration/account help; finding internships/scholarships; connecting with mentors or professors; querying projects; applying for jobs/opportunities; joining project teams.\n"
455
+ "- Universities/Researchers: publish research or announce events; connect/collaborate with students.\n"
456
+ "- Companies: post jobs/internships/graduate roles; discover student talent.\n\n"
457
+ "General rules:\n"
458
+ "- Reply in the user's language (Arabic if the user writes Arabic; otherwise English).\n"
459
+ "- Use data from Weaviate collections (Job, Opportunities, Project) when relevant.\n"
460
+ "- Be concise, step-by-step, and action-oriented (lists, bullets, checklists).\n"
461
+ "- If information is unavailable, state that clearly and suggest the next best step.\n"
462
+ "- For CV analysis, extract skills/experience and recommend matching opportunities.\n"
463
+ "- Assist with applications and team matching.\n"
464
+ "- Ensure that all generated prompts are phrased using positive reinforcement."
465
+ )
466
+
467
+ CONTEXT_INJECT_TEMPLATE = (
468
+ "Context to guide your answer (do not repeat verbatim):\n"
469
+ "- Audience: {audience}\n- Intent: {intent}\n- Extra hints: Keep it practical for this audience."
470
  )
471
 
472
+ # -------------------- Utility helpers --------------------
473
# Inclusive (first, last) code-point pairs treated as Arabic script.
ARABIC_RANGE = (
    (0x0600, 0x06FF),
    (0x0750, 0x077F),
    (0x08A0, 0x08FF),
    (0xFB50, 0xFDFF),
    (0xFE70, 0xFEFF),
    (0x1EE00, 0x1EEFF),
)


def is_arabic(text: str) -> bool:
    """Return True when *text* holds at least one character inside ARABIC_RANGE.

    An empty string yields False.
    """
    return any(
        low <= ord(char) <= high
        for char in text
        for low, high in ARABIC_RANGE
    )
485
+
486
def format_chat_html(history: List[Dict[str, str]]) -> str:
    """Render the chat history as the HTML shown in the chat panel.

    Args:
        history: Messages as ``{"role": ..., "content": ...}`` dicts. Only the
            ``"user"`` and ``"assistant"`` roles are rendered; other roles are
            skipped.

    Returns:
        A single ``<div class='chatbot'>`` element containing one bubble per
        rendered message.
    """
    # Imported locally under a distinct name so it cannot be confused with
    # HTML-holding local variables.
    from html import escape

    pieces = ["<div class='chatbot'>"]
    for message in history:
        role = message["role"]
        content = message["content"]
        if role == "user":
            # User text is untrusted: escape it so typed markup/scripts are
            # displayed literally instead of being injected into the page.
            pieces.append(f"<div class='user-bubble'>{escape(content)}</div>")
        elif role == "assistant":
            # Assistant replies are Markdown (tables enabled) and are rendered
            # to HTML on purpose.
            rendered = markdown.markdown(content, extensions=['tables'])
            pieces.append(f"<div class='bot-bubble'>{rendered}</div>")
    pieces.append("</div>")
    return "".join(pieces)
498
+
499
# Simple keyword-based CV skills/experience extractor (improvable).
# "c++" and "c#" end in a non-word character, so a trailing \b would only match
# when they are directly followed by a letter/digit. They are therefore matched
# without a trailing boundary; the purely alphanumeric keywords keep one.
_SKILL_REGEX = re.compile(
    r"(?<!\w)(?:c\+\+|c#|"
    r"(?:python|java|javascript|nlp|machine learning|deep learning|data science|"
    r"sql|aws|azure|docker|kubernetes|react|node\.js)(?!\w))",
    re.IGNORECASE,
)
_EXP_REGEX = re.compile(r"(\d+)\s*(?:years|year|months|month)\s*(?:of)?\s*(?:experience|exp|worked)", re.IGNORECASE)


def extract_skills_experience(text: str) -> Dict[str, List[str]]:
    """Extract known skill keywords and experience phrases from free text.

    Args:
        text: Raw text, e.g. the extracted content of a CV.

    Returns:
        ``{"skills": [...], "experience": [...]}`` where ``skills`` holds the
        distinct matched keywords, lower-cased and sorted for a stable order,
        and ``experience`` holds each matched phrase (e.g. "3 years of
        experience") in order of appearance.
    """
    skills = sorted({match.group(0).lower() for match in _SKILL_REGEX.finditer(text)})
    experiences = [match.group(0) for match in _EXP_REGEX.finditer(text)]
    return {"skills": skills, "experience": experiences}
507
+
508
+ # -------------------- Ensure auxiliary collections --------------------
509
def ensure_aux_collections():
    """Create the Team, Application and Memory collections when they are missing.

    Each collection is checked with ``collections.exists`` and, if absent,
    created with the listed properties and no vectorizer.
    """
    text = DataType.TEXT
    text_array = DataType.TEXT_ARRAY
    date = DataType.DATE

    # (collection name, ordered (property name, data type) pairs)
    schemas = (
        ("Team", (
            ("teamId", text),
            ("name", text),
            ("projectId", text),
            ("members", text_array),
            ("createdAt", date),
            ("creatorId", text),
        )),
        ("Application", (
            ("applicationId", text),
            ("jobId", text),
            ("opportunityId", text),
            ("applicantName", text),
            ("applicantEmail", text),
            ("coverLetter", text),
            ("cvText", text),
            ("createdAt", date),
        )),
        ("Memory", (
            ("memoryId", text),
            ("sessionId", text),
            ("text", text),
            ("createdAt", date),
        )),
    )

    for collection_name, fields in schemas:
        if weaviate_client.collections.exists(collection_name):
            continue
        weaviate_client.collections.create(
            name=collection_name,
            properties=[Property(name=field_name, data_type=field_type) for field_name, field_type in fields],
            vectorizer_config=Configure.Vectorizer.none()
        )
554
+
555
+ ensure_aux_collections()
556
+
557
+ # -------------------- Weaviate query helpers (RAG) --------------------
558
def query_weaviate_collection(class_name: str, query_text: str, limit: int = 5) -> List[dict]:
    """Run a hybrid search on a Weaviate collection and return property dicts.

    Args:
        class_name: Name of the collection to query.
        query_text: Text passed to the hybrid search.
        limit: Maximum number of objects to request.

    Returns:
        The ``properties`` of each returned object. When the hybrid search
        yields nothing, up to ``limit`` objects are fetched without any query
        as a simple fallback. Any exception is printed and results in ``[]``.
    """
    try:
        target = weaviate_client.collections.get(class_name)

        hybrid_result = target.query.hybrid(query=query_text, limit=limit)
        records = [obj.properties for obj in hybrid_result.objects]
        if records:
            return records

        # Fallback when there is no hit: plain, unfiltered fetch.
        print(f"[Hybrid returned 0 → fallback filter on {class_name}]")
        fallback_result = target.query.fetch_objects(limit=limit)
        return [obj.properties for obj in fallback_result.objects]
    except Exception as e:
        print(f"[Weaviate Query Error] class={class_name} error={e}")
        return []
580
+
581
+
582
def build_rag_prompt(user_question: str, retrieved_items: List[dict], class_name: str) -> str:
    """Build the user prompt that feeds retrieved records to the LLM.

    Args:
        user_question: The question to answer.
        retrieved_items: Property dicts of the retrieved records.
        class_name: Source collection; "Job", "Opportunities" and "Project"
            get a dedicated one-line layout, anything else is dumped via
            ``str(item)`` truncated to 200 characters.

    Returns:
        Intro line, one numbered line per record, then the question and the
        answering instructions.

    Missing properties and properties whose value is ``None`` are both
    tolerated, so a sparse record can no longer raise ``TypeError``.
    """

    def _text(item: dict, key: str, default: str = "") -> str:
        # Treat an explicit None exactly like a missing key.
        value = item.get(key)
        return default if value is None else str(value)

    def _joined(item: dict, key: str) -> str:
        # Lists are comma-joined; a plain string is kept as-is rather than
        # being split into characters; None/missing becomes "".
        values = item.get(key) or []
        if isinstance(values, str):
            return values
        return ", ".join(str(value) for value in values)

    intro = f"Use the following {len(retrieved_items)} records from {class_name} to answer the question succinctly.\n\n"
    parts = []
    for i, item in enumerate(retrieved_items, 1):
        if class_name == "Job":
            parts.append(f"{i}. Title: {_text(item, 'title', 'N/A')} | Company: {_text(item, 'companyName', 'N/A')} | Skills: {_joined(item, 'skills')} | Desc: {_text(item, 'description')[:200]}")
        elif class_name == "Opportunities":
            parts.append(f"{i}. Title: {_text(item, 'title', 'N/A')} | Topic: {_text(item, 'topic', 'N/A')} | Skills: {_joined(item, 'skills')} | Overview: {_text(item, 'overview')[:200]}")
        elif class_name == "Project":
            parts.append(f"{i}. Title: {_text(item, 'title', 'N/A')} | ShortDesc: {_text(item, 'shortDescription')[:200]} | Fields: {_joined(item, 'fields')}")
        else:
            parts.append(f"{i}. {str(item)[:200]}")
    context_block = "\n".join(parts)
    closing = f"\n\nQuestion: {user_question}\nAnswer concisely and, if applicable, include next steps (how to apply / contact / form a team)."
    return intro + context_block + closing
597
+
598
def rag_answer(user_question: str, class_name: str, top_k: int = 5) -> tuple[str, List[dict]]:
    """Answer a question from records retrieved out of one Weaviate collection.

    Args:
        user_question: The question, also used as the retrieval query.
        class_name: Collection to retrieve from.
        top_k: Maximum number of records to retrieve.

    Returns:
        ``(answer, retrieved)``. ``("", [])`` when nothing was retrieved.
        When the LLM call fails, the error is printed and the answer is ``""``
        while the retrieved records are still returned.
    """
    retrieved = query_weaviate_collection(class_name, user_question, limit=top_k)
    if not retrieved:
        return "", []
    prompt = build_rag_prompt(user_question, retrieved, class_name)
    try:
        resp = llm_client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT_BASE},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=512
        )
        answer = resp.choices[0].message.content or ""
    except Exception as e:
        print(f"[RAG LLM Error] {e}")
        answer = ""
    return answer, retrieved
618
+
619
+ # -------------------- Save helpers --------------------
620
def save_application_to_weaviate(application: dict) -> bool:
    """Insert an application record into the "Application" collection.

    Args:
        application: Property dict. ``createdAt`` is filled in (on the passed
            dict) with the current UTC time when absent.

    Returns:
        True on success; False when the insert raised (the error is printed).
    """
    try:
        collection = weaviate_client.collections.get("Application")
        # Ensure createdAt exists. Use a timezone-aware clock
        # (datetime.utcnow() is deprecated) and keep the "...Z" format.
        application.setdefault(
            "createdAt",
            datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"),
        )
        # Object UUID derived from applicationId so the same id maps to the
        # same object; a random id is used when applicationId is missing.
        uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, application.get("applicationId", str(uuid.uuid4()))))
        collection.data.insert(properties=application, uuid=uid)
        return True
    except Exception as e:
        print(f"[Save Application Error] {e}")
        return False
632
+
633
def save_team_to_weaviate(team_props: dict) -> Optional[dict]:
    """Insert a team record into the "Team" collection.

    Args:
        team_props: Property dict. ``createdAt`` is filled in (on the passed
            dict) with the current UTC time when absent.

    Returns:
        ``team_props`` on success; None when the insert raised (the error is
        printed).
    """
    try:
        collection = weaviate_client.collections.get("Team")
        # Timezone-aware replacement for the deprecated datetime.utcnow(),
        # keeping the "...Z" suffix.
        team_props.setdefault(
            "createdAt",
            datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"),
        )
        # Object UUID derived from teamId; random when teamId is missing.
        uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, team_props.get("teamId", str(uuid.uuid4()))))
        collection.data.insert(properties=team_props, uuid=uid)
        return team_props
    except Exception as e:
        print(f"[Save Team Error] {e}")
        return None
643
+
644
def save_memory_to_weaviate(session_id: str, text: str) -> bool:
    """Insert a short memory entry into the "Memory" collection.

    Args:
        session_id: Identifier stored as ``sessionId``.
        text: Memory text to store.

    Returns:
        True on success; False when the insert raised (the error is printed).
    """
    try:
        collection = weaviate_client.collections.get("Memory")
        mem = {
            "memoryId": str(uuid.uuid4()),
            "sessionId": session_id,
            "text": text,
            # Timezone-aware replacement for the deprecated datetime.utcnow(),
            # keeping the "...Z" suffix.
            "createdAt": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"),
        }
        collection.data.insert(properties=mem, uuid=str(uuid.uuid5(uuid.NAMESPACE_DNS, mem["memoryId"])))
        return True
    except Exception as e:
        print(f"[Save Memory Error] {e}")
        return False
653
+
654
+ # -------------------- File processing --------------------
655
+ def process_uploaded_file(file_obj: Any) -> dict | None:
656
+ """
657
+ file_obj is a Gradio file (file_obj.name present)
658
+ returns dict with 'content' and 'profile' keys or None
659
+ """
660
+ if not file_obj:
661
  return None
662
  file_path = file_obj.name
663
+ filename = os.path.basename(file_path)
664
  text_content = ""
665
  try:
666
+ if filename.lower().endswith(".pdf"):
667
  with fitz.open(file_path) as doc:
668
+ for page in doc:
669
+ text_content += page.get_text()
670
+ elif filename.lower().endswith(".docx"):
671
  doc = docx.Document(file_path)
672
+ for p in doc.paragraphs:
673
+ text_content += p.text + "\n"
674
+ elif filename.lower().endswith(".txt"):
675
  with open(file_path, "r", encoding="utf-8") as f:
676
  text_content = f.read()
677
  else:
678
+ return {"error": f"Unsupported file type: {filename}"}
679
+ profile = extract_skills_experience(text_content)
680
+ return {"content": text_content.strip(), "profile": profile, "filename": filename}
681
  except Exception as e:
682
+ print(f"[File process error] {e}")
683
+ return {"error": f"Error processing file {filename}: {e}"}
684
 
685
+ # -------------------- Logging --------------------
686
def log_interaction(user_message: str, route: 'Route', response: str):
    """Append one interaction record to the JSON log at LOG_FILE.

    The whole file is read, the new record appended, and the list written
    back. Any failure while reading or writing is printed and otherwise
    ignored.
    """
    record = {
        "timestamp": time.time(),
        "user_message": user_message,
        "audience": route.audience,
        "intent": route.intent,
        "language": route.language,
        "response": response,
    }
    try:
        entries = []
        if os.path.exists(LOG_FILE):
            with open(LOG_FILE, "r", encoding="utf-8") as source:
                entries = json.load(source)
        entries.append(record)
        with open(LOG_FILE, "w", encoding="utf-8") as sink:
            json.dump(entries, sink, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"[Log error] {e}")
699
 
700
+ # -------------------- Intent routing --------------------
701
@dataclass
class Route:
    """Result of intent routing for a single user message."""

    # Target audience; route_intent produces "student", "university",
    # "company" or "general".
    audience: str
    # Matched KEYS label, or "general" when no keyword matched.
    intent: str
    # Detected language code: "ar" or "en".
    language: str
 
 
706
 
707
def route_intent(text: str, forced_audience: str | None = None) -> Route:
    """Classify a message into audience, intent and language.

    Args:
        text: The message to classify. ``None``/empty is treated as "".
        forced_audience: When one of "student", "university" or "company",
            overrides the audience derived from the matched intent.

    Returns:
        A Route whose ``intent`` is the first KEYS label (in dict order) with
        a keyword contained in the lower-cased text, or "general" when none
        matches; ``audience`` comes from AUDIENCE_MAP (default "general")
        unless forced; ``language`` is "ar" if the text contains Arabic
        characters, else "en".
    """
    # Normalise once so language detection and keyword matching agree on
    # how a missing message is handled.
    text = text or ""
    lang = "ar" if is_arabic(text) else "en"
    text_l = text.lower()

    # Keyword matching is plain substring containment; first label wins.
    match_label = next(
        (label for label, kws in KEYS.items() if any(kw in text_l for kw in kws)),
        None,
    )

    audience = AUDIENCE_MAP.get(match_label, "general")
    if forced_audience in {"student", "university", "company"}:
        audience = forced_audience
    return Route(audience=audience, intent=match_label or "general", language=lang)
722
+
723
+ # -------------------- call_llm --------------------
724
def call_llm(user_message: str, history: List[Dict[str, str]], route: Route, temperature: float = 0.6, max_tokens: int = 512) -> str:
    """Ask the chat model for a reply, giving it routing context and recent turns.

    Args:
        user_message: Text sent as the final user message.
        history: Prior chat messages; only the last six entries are included.
        route: Routing result whose audience/intent are injected as a second
            system message.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.

    Returns:
        The model's reply, or "" when the reply is empty or the call failed
        (the error is printed).
    """
    max_turns = 3
    # Two messages (user + assistant) per turn.
    recent_messages = history[-max_turns * 2:] if history else []
    routing_context = CONTEXT_INJECT_TEMPLATE.format(audience=route.audience, intent=route.intent)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT_BASE},
        {"role": "system", "content": routing_context},
        *recent_messages,
        {"role": "user", "content": user_message},
    ]

    try:
        completion = llm_client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens
        )
        return completion.choices[0].message.content or ""
    except Exception as e:
        print(f"[LLM Error] {e}")
        return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
745
 
746
+ # -------------------- Main respond flow (Gradio) --------------------
747
  with gr.Blocks(css="""
748
  .chatbot {height: 500px; overflow: auto;}
749
  .user-bubble {background-color: #DCF8C6; padding: 10px; border-radius: 12px; max-width: 75%; float: right; clear: both; margin: 5px; word-wrap: break-word;}
 
753
  .bot-bubble th, .bot-bubble td {border: 1px solid #ddd; padding: 8px; text-align: left;}
754
  .bot-bubble th {background-color: #e9e9e9;}
755
  """) as demo:
756
+
757
+ gr.Markdown("# 🤖 EduNatives Assistant")
758
 
759
  with gr.Row():
760
+ audience_dd = gr.Dropdown(label="Audience", choices=["Auto", "Student", "University-Research", "Company"], value="Auto", interactive=True)
 
 
 
 
 
 
761
  clear_btn = gr.Button("🧹 Clear Chat")
762
 
763
  status = gr.Markdown("Status: Ready.")
764
  chatbot_html = gr.HTML("<div class='chatbot' id='chatbot'></div>")
765
  chat_history_state = gr.State([])
766
+ user_id_state = gr.State("default_user")
767
 
768
  with gr.Row(elem_classes="chatbox-container"):
769
+ msg = gr.Textbox(placeholder="اكتب سؤالك هنا... / Ask your question here...", lines=2, scale=4, autofocus=True)
770
+ file_uploader = gr.File(label="Upload Document (.txt, .pdf, .docx)", file_types=[".txt", ".pdf", ".docx"], file_count="single", interactive=True)
 
 
 
 
 
 
 
771
  with gr.Column(scale=1, min_width=120):
772
  send_btn = gr.Button("➡️ Send", scale=1, variant="primary")
773
 
774
def _utc_now_iso() -> str:
    """Return the current UTC time as an RFC 3339 string ending in "Z"."""
    return datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z")


def respond(user_text: str, file_obj: Any, history: List[Dict[str, str]], audience_choice: str, user_id: str):
    """Handle one chat turn: route, retrieve/answer, run side flows, update UI.

    Args:
        user_text: Text typed by the user (may be empty when a file is sent).
        file_obj: Uploaded file object or None.
        history: Chat history list; it is appended to in place.
        audience_choice: Dropdown value ("Auto", "Student",
            "University-Research" or "Company").
        user_id: Identifier stored as team creator and used for the memory
            session id.

    Returns:
        A 6-tuple matching the bound outputs: cleared textbox value, chat
        HTML, updated history, status markdown, cleared file uploader (None)
        and the unchanged user id.
    """
    user_text = (user_text or "").strip()
    # Process the file if one was uploaded.
    doc_info = process_uploaded_file(file_obj) if file_obj else None
    if not user_text and not doc_info:
        return "", format_chat_html(history), history, "Status: Please type a message or upload a file.", None, user_id

    file_error = doc_info.get("error") if doc_info else None
    doc_content = doc_info.get("content", "") if doc_info else ""
    if file_error and not user_text:
        # Nothing usable was sent: report the file problem instead of calling
        # the LLM with an empty prompt.
        return "", format_chat_html(history), history, f"Status: ⚠️ {file_error}", None, user_id

    # Build the combined LLM input (document + question).
    llm_input = user_text
    if doc_info and "content" in doc_info:
        llm_input = f"Based on the document content below, answer the question.\n\n---DOCUMENT---\n{doc_info['content'][:6000]}\n---END DOCUMENT---\n\nQuestion: {user_text}"

    forced = {"Student": "student", "University-Research": "university", "Company": "company"}.get(audience_choice)
    # Route on what the user typed; fall back to the document-based prompt only
    # for file-only messages. Routing on the combined prompt would let words
    # inside the document decide the intent and the reply language.
    route = route_intent(user_text or llm_input, forced_audience=forced)
    status_text = f"**Audience**: {route.audience} | **Intent**: {route.intent} | **Lang**: {route.language.upper()}"
    if file_error:
        status_text += f" | ⚠️ {file_error}"

    # Quick CV skills, if a document was provided.
    cv_profile = doc_info.get("profile") if doc_info and "profile" in doc_info else {"skills": [], "experience": []}

    # Decide the RAG target from the intent / keywords.
    text_lower = (user_text or "").lower()
    wants_project = any(k in text_lower for k in KEYS.get("project_query", []))
    wants_job = any(k in text_lower for k in KEYS.get("company_post_jobs", [])) or any(k in text_lower for k in KEYS.get("apply_job_opportunity", []))
    wants_opp = any(k in text_lower for k in KEYS.get("student_internships", []))

    if wants_project or route.intent == "project_query":
        rag_targets = ["Project"]
    elif wants_job or route.intent in ("apply_job_opportunity", "company_post_jobs"):
        # Try jobs first, then fall back to opportunities.
        rag_targets = ["Job", "Opportunities"]
    elif wants_opp or route.intent == "student_internships":
        rag_targets = ["Opportunities"]
    else:
        rag_targets = []

    rag_query = user_text or doc_content
    final_answer = ""
    retrieved = []

    try:
        for class_name in rag_targets:
            final_answer, retrieved = rag_answer(rag_query, class_name, top_k=5)
            if final_answer:
                break
        if not final_answer:
            # No RAG target, or RAG produced no answer: plain LLM reply with
            # routing context, then the registration KB text as last resort.
            final_answer = call_llm(llm_input, history, route) or KB.get("student_registration", {}).get(route.language, "Sorry, I don't have info.")
    except Exception as e:
        print(f"[Respond Error] {e}")
        final_answer = call_llm(llm_input, history, route) or KB.get("student_registration", {}).get(route.language, "")

    # --- Application flow: the user indicated (by keyword) they want to apply ---
    wants_apply = any(kw in text_lower for kw in KEYS.get("apply_job_opportunity", []))
    app_summary = ""
    generated_cover = ""
    if wants_apply:
        target = retrieved[0] if retrieved else None
        cover_text = ""
        if target:
            # Build the cover-letter prompt from the target and the CV text.
            cover_prompt = f"Write a concise 3-paragraph cover letter applying for this role:\nRole details: {json.dumps(target, ensure_ascii=False)[:1500]}\n"
            if doc_info and "content" in doc_info:
                cover_prompt += f"\nApplicant CV summary: {doc_info['content'][:2000]}\n"
            cover_prompt += "\nWrite the letter in the same language as the user."
            try:
                resp = llm_client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[{"role": "system", "content": SYSTEM_PROMPT_BASE}, {"role": "user", "content": cover_prompt}],
                    temperature=0.3,
                    max_tokens=512
                )
                cover_text = resp.choices[0].message.content or ""
            except Exception as e:
                print(f"[Cover generation error] {e}")
                cover_text = "I can help craft a cover letter, but an error occurred while generating it."

            application = {
                "applicationId": str(uuid.uuid4()),
                "jobId": target.get("id") or target.get("jobId"),
                "opportunityId": target.get("id") or target.get("opportunityId"),
                "applicantName": "Unknown",
                "applicantEmail": "Unknown",
                "coverLetter": cover_text,
                "cvText": doc_content[:4000],
                # Must carry a timezone: createdAt is a DATE property.
                "createdAt": _utc_now_iso(),
            }
            saved = save_application_to_weaviate(application)
            app_summary = "✅ Application prepared and saved." if saved else "⚠️ Application could not be saved."
            generated_cover = cover_text
        else:
            app_summary = "لم أجد وظيفة/فرصة مناسبة تلقائياً من النتائج. أرسل عنوان الوظيفة أو اختر من النتيجة."

    # --- Team creation flow (join_team) ---
    team_created_msg = ""
    wants_team = any(k in text_lower for k in KEYS.get("join_team", []))
    if wants_team and (wants_project or route.intent == "join_team"):
        # Try to suggest members from the CV skills; otherwise create an
        # empty team entry.
        suggested_members = []
        if cv_profile and cv_profile.get("skills"):
            matches = query_weaviate_collection("Opportunities", " ".join(cv_profile.get("skills", [])), limit=5)
            for m in matches:
                name = m.get("studentName") or m.get("student") or m.get("name")
                if name:
                    suggested_members.append(name)

        team_props = {
            "teamId": str(uuid.uuid4()),
            "name": f"Team for project - {uuid.uuid4().hex[:6]}",
            "projectId": (retrieved[0].get("globalId") or None) if retrieved else None,
            "members": suggested_members,
            "createdAt": _utc_now_iso(),
            "creatorId": user_id,
        }
        if not save_team_to_weaviate(team_props):
            team_created_msg = "⚠️ لم أتمكن من إنشاء الفريق الآن."
        elif suggested_members:
            team_created_msg = f"✅ Team created with members: {', '.join(suggested_members)}"
        else:
            team_created_msg = f"✅ Team created with id {team_props['teamId']}. يمكنك إضافة أعضاء لاحقًا."

    # Save a short memory entry.
    try:
        sess = str(uuid.uuid5(uuid.NAMESPACE_DNS, (user_id or "anon") + (user_text or "")[:50]))
        mem_text = f"User: {user_text[:300]} | Action: RAG on { 'Project' if wants_project else 'Job' if wants_job else 'Opportunities' if wants_opp else 'LLM' }"
        save_memory_to_weaviate(sess, mem_text)
    except Exception as e:
        print(f"[Memory Save Error] {e}")

    # Prepare the final message (answer + top results + app/team status).
    message_parts = []
    if final_answer:
        message_parts.append(final_answer)
    if retrieved:
        list_lines = []
        for item in retrieved[:5]:
            title = item.get("title") or item.get("jobTitle") or item.get("globalId") or "No Title"
            meta = item.get("companyName") or item.get("topic") or item.get("shortDescription", "")
            list_lines.append(f"- **{title}** | {meta}")
        if list_lines:
            message_parts.append("\n\n**Top results:**\n" + "\n".join(list_lines))
    if wants_apply:
        message_parts.append("\n\n**Application status:** " + app_summary)
    if generated_cover:
        message_parts.append("\n\n**Generated Cover Letter:**\n" + generated_cover)
    if team_created_msg:
        message_parts.append("\n\n" + team_created_msg)

    final_message_to_user = "\n\n".join([p for p in message_parts if p])

    # Update history.
    user_message_for_history = user_text
    if doc_info and doc_info.get("filename"):
        user_message_for_history += f"\n\n*📎 [File Attached: {doc_info.get('filename')}] *"

    # Apologise in the user's language when there is nothing to show.
    no_answer_text = (
        "Sorry, I couldn't find a suitable answer right now."
        if route.language == "en"
        else "عذراً، لم أجد إجابة مناسبة الآن."
    )
    history.append({"role": "user", "content": user_message_for_history})
    history.append({"role": "assistant", "content": final_message_to_user or no_answer_text})

    # Log the interaction; a logging failure must not break the chat turn,
    # but it should be visible.
    try:
        log_interaction(user_text, route, final_message_to_user)
    except Exception as e:
        print(f"[Log call error] {e}")

    # Return values: clear input field, updated html, updated history, status,
    # clear file uploader, keep user_id.
    return "", format_chat_html(history), history, status_text, None, user_id
950
+
951
def clear_chat():
    """Reset every UI component bound to the clear button.

    Returns:
        One value per bound output, in order: empty textbox, empty chat HTML
        container, empty history list, ready status text, cleared file
        uploader (None) and the default user id.
    """
    # Six outputs are bound (msg, chatbot_html, chat_history_state, status,
    # file_uploader, user_id_state); the values must line up one-to-one.
    return "", "<div class='chatbot' id='chatbot'></div>", [], "Status: Ready.", None, "default_user"
953
+
954
+ # Bind events
955
# Components read by respond() and components written by both handlers.
respond_inputs = [msg, file_uploader, chat_history_state, audience_dd, user_id_state]
ui_outputs = [msg, chatbot_html, chat_history_state, status, file_uploader, user_id_state]

# The Send button and pressing Enter in the textbox trigger the same handler.
for register_event in (send_btn.click, msg.submit):
    register_event(
        respond,
        inputs=list(respond_inputs),
        outputs=list(ui_outputs),
        queue=True
    )

# Clearing bypasses the queue.
clear_btn.click(
    clear_chat,
    outputs=list(ui_outputs),
    queue=False
)
972
 
973
  if __name__ == "__main__":
974
  demo.launch(debug=True)