tdecae committed on
Commit
d8c2382
·
verified ·
1 Parent(s): 2c28fae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -124
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import sys
3
  import requests
4
 
5
- # SQLite workaround for Chroma on Hugging Face Spaces
6
  try:
7
  __import__("pysqlite3")
8
  sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
@@ -17,9 +17,11 @@ from langchain_chroma import Chroma
17
  from langchain_huggingface import HuggingFaceEmbeddings
18
 
19
 
 
 
 
20
  DOCS_DIR = "multiple_docs"
21
  DB_DIR = "./db"
22
- COLLECTION_NAME = "thierry_recruitment_docs"
23
 
24
  DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
25
  DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
@@ -31,9 +33,12 @@ WELCOME_MESSAGE = (
31
  )
32
 
33
 
34
- def call_deepseek(messages, temperature=0.4, max_tokens=700):
 
 
 
35
  if not DEEPSEEK_API_KEY:
36
- return "DEEPSEEK_API_KEY is not set in the Hugging Face Space secrets."
37
 
38
  headers = {
39
  "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
@@ -43,211 +48,166 @@ def call_deepseek(messages, temperature=0.4, max_tokens=700):
43
  payload = {
44
  "model": "deepseek-chat",
45
  "messages": messages,
46
- "temperature": temperature,
47
- "max_tokens": max_tokens,
48
  }
49
 
50
- response = requests.post(
51
- DEEPSEEK_API_URL,
52
- headers=headers,
53
- json=payload,
54
- timeout=60,
55
- )
56
  response.raise_for_status()
57
 
58
- data = response.json()
59
- return data["choices"][0]["message"]["content"].strip()
60
 
61
 
 
 
 
62
  def load_documents():
63
- if not os.path.exists(DOCS_DIR):
64
- raise FileNotFoundError(f"Folder not found: {DOCS_DIR}")
65
-
66
  docs = []
67
 
68
- for filename in os.listdir(DOCS_DIR):
69
- path = os.path.join(DOCS_DIR, filename)
70
- lower = filename.lower()
71
 
72
  try:
73
- if lower.endswith(".pdf"):
74
- loader = PyPDFLoader(path)
75
- docs.extend(loader.load())
76
-
77
- elif lower.endswith(".docx"):
78
- loader = Docx2txtLoader(path)
79
- docs.extend(loader.load())
80
-
81
- elif lower.endswith(".txt"):
82
- loader = TextLoader(path, encoding="utf-8")
83
- docs.extend(loader.load())
84
-
85
  except Exception as e:
86
- print(f"Could not load {filename}: {e}", flush=True)
87
 
88
  if not docs:
89
- raise ValueError(f"No documents found in {DOCS_DIR}")
90
-
91
- splitter = CharacterTextSplitter(
92
- chunk_size=1000,
93
- chunk_overlap=100,
94
- )
95
 
 
96
  return splitter.split_documents(docs)
97
 
98
 
 
 
 
99
  def build_vectorstore():
100
- print("Loading embedding model...", flush=True)
101
 
102
- embedding_function = HuggingFaceEmbeddings(
103
  model_name="sentence-transformers/all-MiniLM-L6-v2"
104
  )
105
 
106
- print("Loading documents...", flush=True)
107
  docs = load_documents()
 
108
 
109
- print(f"Loaded {len(docs)} document chunks.", flush=True)
110
-
111
- vectorstore = Chroma.from_documents(
112
- documents=docs,
113
- embedding=embedding_function,
114
  persist_directory=DB_DIR,
115
- collection_name=COLLECTION_NAME,
116
  )
117
 
118
- return vectorstore
119
-
120
 
121
  vectorstore = build_vectorstore()
122
  retriever = vectorstore.as_retriever(search_kwargs={"k": 6})
123
 
124
 
125
- def format_chat_history(history):
 
 
 
126
  if not history:
127
  return ""
128
 
129
  lines = []
130
-
131
- for item in history[-8:]:
132
- if not item or len(item) != 2:
133
- continue
134
-
135
- user_msg, assistant_msg = item
136
-
137
- if user_msg:
138
- lines.append(f"user: {user_msg}")
139
-
140
- if assistant_msg:
141
- lines.append(f"assistant: {assistant_msg}")
142
 
143
  return "\n".join(lines)
144
 
145
 
146
- def answer_question(query, chat_history):
147
- if chat_history is None:
148
- chat_history = [[None, WELCOME_MESSAGE]]
149
-
150
- if not query or not query.strip():
151
- return "", chat_history
152
 
153
- query = query.strip()
 
154
 
155
  try:
156
- retrieved_docs = retriever.invoke(query)
 
157
 
158
- context = "\n\n".join(
159
- doc.page_content for doc in retrieved_docs if doc.page_content
160
- )
161
-
162
- history_text = format_chat_history(chat_history)
163
 
164
  system_prompt = """
165
  You are Thierry Decae's recruitment chatbot.
166
 
167
- Your role is to answer questions about Thierry Decae's career, experience,
168
- skills, projects, availability, work eligibility, and professional background.
169
-
170
- Use only the provided context when answering factual questions.
171
- If the answer is not available in the context, say: "I'm not sure about that."
172
-
173
- Always answer as Thierry, using "I", "my", and "me".
174
- Do not refer to Thierry as "he".
175
- Be professional, concise, and helpful.
176
- You may answer in the same language as the user.
177
  """
178
 
179
  user_prompt = f"""
180
- Conversation history:
181
  {history_text}
182
 
183
- Context from Thierry's documents:
184
  {context}
185
 
186
- User question:
187
  {query}
188
 
189
  Answer:
190
  """
191
 
192
- answer = call_deepseek(
193
- messages=[
194
- {"role": "system", "content": system_prompt},
195
- {"role": "user", "content": user_prompt},
196
- ]
197
- )
198
 
199
  except Exception as e:
200
- print(f"Error while answering: {e}", flush=True)
201
- answer = (
202
- "Sorry, I ran into an error while answering. "
203
- "Please try again in a moment."
204
- )
205
 
206
- chat_history.append([query, answer])
 
207
 
208
- return "", chat_history
209
 
210
 
211
  def clear_chat():
212
- return [[None, WELCOME_MESSAGE]]
213
 
214
 
215
- avatar_images = None
216
-
 
217
  guest_img = os.path.join(DOCS_DIR, "Guest.jpg")
218
  thierry_img = os.path.join(DOCS_DIR, "Thierry Picture.jpg")
219
 
 
220
  if os.path.exists(guest_img) and os.path.exists(thierry_img):
221
- avatar_images = [guest_img, thierry_img]
222
 
223
 
224
  with gr.Blocks() as demo:
225
  gr.Markdown("# Thierry Decae Recruitment Chatbot")
226
 
227
  chatbot = gr.Chatbot(
228
- value=[[None, WELCOME_MESSAGE]],
229
- avatar_images=avatar_images,
230
  height=500,
231
  )
232
 
233
- msg = gr.Textbox(
234
- placeholder="Ask a recruitment-related question...",
235
- label="Your question",
236
- )
237
 
238
- clear = gr.Button("Clear chat")
239
 
240
- msg.submit(
241
- answer_question,
242
- inputs=[msg, chatbot],
243
- outputs=[msg, chatbot],
244
- )
245
-
246
- clear.click(
247
- clear_chat,
248
- inputs=None,
249
- outputs=chatbot,
250
- )
251
 
252
 
253
  demo.launch(
 
2
  import sys
3
  import requests
4
 
5
+ # SQLite workaround (needed for Chroma on HF Spaces)
6
  try:
7
  __import__("pysqlite3")
8
  sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
 
17
  from langchain_huggingface import HuggingFaceEmbeddings
18
 
19
 
20
+ # ========================
21
+ # CONFIG
22
+ # ========================
23
  DOCS_DIR = "multiple_docs"
24
  DB_DIR = "./db"
 
25
 
26
  DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
27
  DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
 
33
  )
34
 
35
 
36
+ # ========================
37
+ # DEEPSEEK CALL
38
+ # ========================
39
+ def call_deepseek(messages):
40
  if not DEEPSEEK_API_KEY:
41
+ return "Missing DEEPSEEK_API_KEY."
42
 
43
  headers = {
44
  "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
 
48
  payload = {
49
  "model": "deepseek-chat",
50
  "messages": messages,
51
+ "temperature": 0.4,
52
+ "max_tokens": 700,
53
  }
54
 
55
+ response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload, timeout=60)
 
 
 
 
 
56
  response.raise_for_status()
57
 
58
+ return response.json()["choices"][0]["message"]["content"].strip()
 
59
 
60
 
61
# ========================
# LOAD DOCS
# ========================
def load_documents():
    """Load every supported file in DOCS_DIR and split it into chunks.

    Files ending in .pdf, .docx or .txt (matched case-insensitively) are
    read with PyPDFLoader, Docx2txtLoader and TextLoader respectively; any
    other file is ignored. A file that fails to load is reported on stdout
    and skipped so that one bad document does not abort start-up.

    Returns:
        The result of CharacterTextSplitter.split_documents() over all
        loaded documents (chunk_size=1000, chunk_overlap=100).

    Raises:
        ValueError: If no document could be loaded.
        OSError: Propagated from os.listdir() when DOCS_DIR cannot be read
            (e.g. FileNotFoundError if the folder is missing).
    """
    docs = []

    # Sorted so the chunk order does not depend on the filesystem's
    # directory ordering, which os.listdir() leaves unspecified.
    for f in sorted(os.listdir(DOCS_DIR)):
        path = os.path.join(DOCS_DIR, f)
        # Compare on a lower-cased copy so "CV.PDF" is not silently skipped;
        # the original name is kept for the path and the log message.
        lower = f.lower()

        try:
            if lower.endswith(".pdf"):
                docs.extend(PyPDFLoader(path).load())
            elif lower.endswith(".docx"):
                docs.extend(Docx2txtLoader(path).load())
            elif lower.endswith(".txt"):
                docs.extend(TextLoader(path, encoding="utf-8").load())
        except Exception as e:
            # Best effort: log and continue with the remaining files.
            print(f"Error loading {f}: {e}", flush=True)

    if not docs:
        raise ValueError("No documents found")

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return splitter.split_documents(docs)
85
 
86
 
87
# ========================
# VECTORSTORE
# ========================
def build_vectorstore():
    """Embed the document chunks and return a Chroma store built from them.

    Uses the sentence-transformers/all-MiniLM-L6-v2 embedding model and
    passes DB_DIR as the store's persist directory.

    Returns:
        The object returned by Chroma.from_documents().
    """
    print("Loading embeddings...", flush=True)
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    chunks = load_documents()
    print(f"Loaded {len(chunks)} chunks", flush=True)

    # NOTE(review): DB_DIR is reused on every start; if it survives restarts
    # the same chunks may be added again - confirm against deployment.
    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=DB_DIR,
    )
    return store
105
 
 
 
106
 
107
  vectorstore = build_vectorstore()
108
  retriever = vectorstore.as_retriever(search_kwargs={"k": 6})
109
 
110
 
111
# ========================
# HISTORY FORMAT
# ========================
def format_history(history):
    """Render the most recent chat turns as "role: content" lines.

    Only the last 8 history entries are considered. Each entry may be:
      * a dict with "role" and "content" keys (messages format),
      * a 2-item list/tuple ``[user_text, assistant_text]`` (legacy pairs),
      * any object exposing ``role`` and ``content`` attributes.
    Entries of any other shape, and entries whose role or content is
    missing or empty, are skipped instead of raising.

    Args:
        history: Sequence of chat entries, or None.

    Returns:
        The rendered lines joined with newlines; "" when there is nothing
        to render.
    """
    if not history:
        return ""

    lines = []
    for msg in history[-8:]:
        if isinstance(msg, dict):
            turns = [(msg.get("role"), msg.get("content"))]
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            # Legacy pair format: one entry carries both sides of a turn.
            turns = [("user", msg[0]), ("assistant", msg[1])]
        else:
            # Message-like objects; anything else yields (None, None).
            turns = [(getattr(msg, "role", None), getattr(msg, "content", None))]

        lines.extend(
            f"{role}: {content}" for role, content in turns if role and content
        )

    return "\n".join(lines)
126
 
127
 
128
# ========================
# MAIN QA FUNCTION
# ========================
def answer_question(query, history):
    """Answer one user question and append the exchange to the history.

    The query is trimmed, relevant chunks are fetched through ``retriever``,
    and the chunks plus the formatted recent history are sent to
    ``call_deepseek``. Any exception raised while retrieving or calling the
    model is logged and replaced by a generic error answer.

    Args:
        query: Text entered by the user; None or whitespace-only input is
            treated as "nothing to ask".
        history: List of {"role": ..., "content": ...} message dicts, or
            None to start from the welcome message. The list is extended
            in place.

    Returns:
        A ``("", history)`` tuple: an empty string followed by the
        (possibly extended) history.
    """
    if history is None:
        history = [{"role": "assistant", "content": WELCOME_MESSAGE}]

    # Tolerate None as well as blank input, and trim once so retrieval and
    # the stored transcript do not carry stray whitespace.
    query = (query or "").strip()
    if not query:
        return "", history

    try:
        docs = retriever.invoke(query)
        context = "\n\n".join(d.page_content for d in docs if d.page_content)

        history_text = format_history(history)

        system_prompt = """
You are Thierry Decae's recruitment chatbot.

Answer questions about Thierry's experience, skills, and career.
Use only provided context.
If unsure, say "I'm not sure about that."
Always answer as Thierry ("I", "my").
"""

        user_prompt = f"""
Conversation:
{history_text}

Context:
{context}

Question:
{query}

Answer:
"""

        answer = call_deepseek([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ])

    except Exception as e:
        # Top-level boundary for the UI callback: log with enough context to
        # diagnose (repr keeps the exception type) and degrade gracefully.
        print(f"Error while answering: {e!r}", flush=True)
        answer = "Error while answering."

    history.append({"role": "user", "content": query})
    history.append({"role": "assistant", "content": answer})

    return "", history
179
 
180
 
181
  def clear_chat():
182
+ return [{"role": "assistant", "content": WELCOME_MESSAGE}]
183
 
184
 
185
+ # ========================
186
+ # UI
187
+ # ========================
188
  guest_img = os.path.join(DOCS_DIR, "Guest.jpg")
189
  thierry_img = os.path.join(DOCS_DIR, "Thierry Picture.jpg")
190
 
191
+ avatars = None
192
  if os.path.exists(guest_img) and os.path.exists(thierry_img):
193
+ avatars = [guest_img, thierry_img]
194
 
195
 
196
  with gr.Blocks() as demo:
197
  gr.Markdown("# Thierry Decae Recruitment Chatbot")
198
 
199
  chatbot = gr.Chatbot(
200
+ value=[{"role": "assistant", "content": WELCOME_MESSAGE}],
201
+ avatar_images=avatars,
202
  height=500,
203
  )
204
 
205
+ msg = gr.Textbox(placeholder="Ask a question...")
 
 
 
206
 
207
+ clear = gr.Button("Clear")
208
 
209
+ msg.submit(answer_question, [msg, chatbot], [msg, chatbot])
210
+ clear.click(clear_chat, None, chatbot)
 
 
 
 
 
 
 
 
 
211
 
212
 
213
  demo.launch(