tdecae committed on
Commit
9402eba
·
verified ·
1 Parent(s): acc088f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -244
app.py CHANGED
@@ -1,275 +1,243 @@
1
- # import os
2
- # import sys
3
- # import requests
4
- # from langchain.chains import ConversationalRetrievalChain
5
- # from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
6
- # from langchain.text_splitter import CharacterTextSplitter
7
- # from langchain.vectorstores import Chroma
8
- # from langchain.embeddings import HuggingFaceEmbeddings
9
- # from langchain.llms.base import LLM
10
- # import gradio as gr
11
-
12
- # # workaround for sqlite in HF spaces
13
- # __import__('pysqlite3')
14
- # sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
15
-
16
- # # 📄 Load documents
17
- # docs = []
18
- # for f in os.listdir("multiple_docs"):
19
- # if f.endswith(".pdf"):
20
- # loader = PyPDFLoader(os.path.join("multiple_docs", f))
21
- # docs.extend(loader.load())
22
- # elif f.endswith(".docx") or f.endswith(".doc"):
23
- # loader = Docx2txtLoader(os.path.join("multiple_docs", f))
24
- # docs.extend(loader.load())
25
- # elif f.endswith(".txt"):
26
- # loader = TextLoader(os.path.join("multiple_docs", f))
27
- # docs.extend(loader.load())
28
-
29
- # # 🔗 Split into chunks
30
- # splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
31
- # docs = splitter.split_documents(docs)
32
-
33
- # texts = [doc.page_content for doc in docs]
34
- # metadatas = [{"id": i} for i in range(len(texts))]
35
-
36
- # # 🧠 Embeddings
37
- # embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
38
-
39
- # # 🗃️ Vectorstore
40
- # vectorstore = Chroma(
41
- # persist_directory="./db",
42
- # embedding_function=embedding_function
43
- # )
44
- # vectorstore.add_texts(texts=texts, metadatas=metadatas)
45
- # vectorstore.persist()
46
-
47
- # # 🔐 Get DeepSeek API key from env
48
- # DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
49
- # if DEEPSEEK_API_KEY is None:
50
- # raise ValueError("DEEPSEEK_API_KEY environment variable is not set.")
51
-
52
- # # 🌟 DeepSeek API endpoint
53
- # DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
54
-
55
- # # 🔷 Wrap DeepSeek API into LangChain LLM
56
- # class DeepSeekLLM(LLM):
57
- # """LLM that queries DeepSeek's API."""
58
- # api_key: str = DEEPSEEK_API_KEY
59
-
60
- # def _call(self, prompt, stop=None, run_manager=None, **kwargs):
61
- # headers = {
62
- # "Authorization": f"Bearer {self.api_key}",
63
- # "Content-Type": "application/json"
64
- # }
65
- # payload = {
66
- # "model": "deepseek-chat", # adjust if you have a specific model name
67
- # "messages": [
68
- # {"role": "system", "content": "You are a helpful assistant."},
69
- # {"role": "user", "content": prompt}
70
- # ],
71
- # "temperature": 0.7,
72
- # "max_tokens": 512
73
- # }
74
- # response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload)
75
- # response.raise_for_status()
76
- # data = response.json()
77
- # return data["choices"][0]["message"]["content"].strip()
78
-
79
- # @property
80
- # def _llm_type(self) -> str:
81
- # return "deepseek_api"
82
-
83
- # llm = DeepSeekLLM()
84
-
85
- # # 🔗 Conversational chain
86
- # chain = ConversationalRetrievalChain.from_llm(
87
- # llm,
88
- # retriever=vectorstore.as_retriever(search_kwargs={'k': 6}),
89
- # return_source_documents=True,
90
- # verbose=False
91
- # )
92
-
93
- # # 💬 Gradio UI
94
- # chat_history = []
95
-
96
- # with gr.Blocks() as demo:
97
- # chatbot = gr.Chatbot(
98
- # [("", "Hello, I'm Thierry Decae's chatbot, you can ask me any recruitment related questions such as my experience, where I'm eligible to work, skills etc you can chat with me directly in multiple languages")],
99
- # avatar_images=["./multiple_docs/Guest.jpg", "./multiple_docs/Thierry Picture.jpg"]
100
- # )
101
- # msg = gr.Textbox(placeholder="Type your question here...")
102
- # clear = gr.Button("Clear")
103
-
104
- # def user(query, chat_history):
105
- # chat_history_tuples = [(m[0], m[1]) for m in chat_history]
106
- # result = chain({"question": query, "chat_history": chat_history_tuples})
107
- # chat_history.append((query, result["answer"]))
108
- # return gr.update(value=""), chat_history
109
-
110
- # msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
111
- # clear.click(lambda: None, None, chatbot, queue=False)
112
-
113
- # demo.launch(debug=True) # remove share=True if running in HF Spaces
114
-
115
-
116
  import os
117
  import sys
118
  import requests
119
- from langchain.chains import ConversationalRetrievalChain, LLMChain
120
- from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
121
- from langchain.text_splitter import CharacterTextSplitter
122
- from langchain.vectorstores import Chroma
123
- from langchain.embeddings import HuggingFaceEmbeddings
124
- from langchain.llms.base import LLM
125
- from langchain.prompts import PromptTemplate
126
- from langchain.chains.question_answering import load_qa_chain
127
  import gradio as gr
128
 
129
- # workaround for sqlite in HF spaces
130
- __import__('pysqlite3')
131
- sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
132
-
133
- # 📄 Load documents
134
- docs = []
135
- for f in os.listdir("multiple_docs"):
136
- if f.endswith(".pdf"):
137
- loader = PyPDFLoader(os.path.join("multiple_docs", f))
138
- docs.extend(loader.load())
139
- elif f.endswith(".docx") or f.endswith(".doc"):
140
- loader = Docx2txtLoader(os.path.join("multiple_docs", f))
141
- docs.extend(loader.load())
142
- elif f.endswith(".txt"):
143
- loader = TextLoader(os.path.join("multiple_docs", f))
144
- docs.extend(loader.load())
145
-
146
- # 🔗 Split into chunks
147
- splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
148
- docs = splitter.split_documents(docs)
149
-
150
- texts = [doc.page_content for doc in docs]
151
- metadatas = [{"id": i} for i in range(len(texts))]
152
-
153
- # 🧠 Embeddings
154
- embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
155
-
156
- # 🗃️ Vectorstore
157
- vectorstore = Chroma(
158
- persist_directory="./db",
159
- embedding_function=embedding_function
160
- )
161
- vectorstore.add_texts(texts=texts, metadatas=metadatas)
162
- vectorstore.persist()
163
 
164
- # 🔐 Get DeepSeek API key from env
165
- DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
166
- if DEEPSEEK_API_KEY is None:
167
- raise ValueError("DEEPSEEK_API_KEY environment variable is not set.")
168
 
169
- # 🌟 DeepSeek API endpoint
 
 
 
 
170
  DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
171
 
172
- # 🔷 Wrap DeepSeek API into LangChain LLM
173
- class DeepSeekLLM(LLM):
174
- """LLM that queries DeepSeek's API."""
175
- api_key: str = DEEPSEEK_API_KEY
176
-
177
- def _call(self, prompt, stop=None, run_manager=None, **kwargs):
178
- headers = {
179
- "Authorization": f"Bearer {self.api_key}",
180
- "Content-Type": "application/json"
181
- }
182
- payload = {
183
- "model": "deepseek-chat", # adjust if you have a specific model name
184
- "messages": [
185
- {"role": "system", "content": "You are a helpful assistant."},
186
- {"role": "user", "content": prompt}
187
- ],
188
- "temperature": 0.7,
189
- "max_tokens": 512
190
- }
191
- response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload)
192
- response.raise_for_status()
193
- data = response.json()
194
- return data["choices"][0]["message"]["content"].strip()
195
-
196
- @property
197
- def _llm_type(self) -> str:
198
- return "deepseek_api"
199
-
200
- llm = DeepSeekLLM()
201
-
202
- # Custom prompt template
203
- template = """
204
- You are Thierry Decae's chatbot. Your role is to answer questions about his career, experience, availability — in other words
205
- any recruitment-related question.
206
- Use the following context to answer the user's question as fully and accurately as possible.
207
- If you don't know the answer, say "I'm not sure about that."
208
- Always answer as if you were Thierry Decae — do not refer to him as 'he', use 'I' instead.
209
-
210
- Context:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  {context}
212
 
213
- Question: {question}
 
214
 
215
  Answer:
216
  """
217
 
218
- prompt = PromptTemplate(
219
- input_variables=["context", "question"],
220
- template=template,
221
- )
 
 
222
 
223
- # 🔗 QA chain with custom prompt
224
- qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)
 
 
 
 
225
 
226
- # 🔷 Question rephraser chain for follow-up questions → standalone
227
- CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(
228
- """
229
- Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question.
230
 
231
- Chat History:
232
- {chat_history}
233
- Follow Up Input: {question}
234
- Standalone question:
235
- """
236
- )
237
 
238
- question_generator = LLMChain(
239
- llm=llm,
240
- prompt=CONDENSE_QUESTION_PROMPT
241
- )
242
 
243
- # 🔷 Finally: build the ConversationalRetrievalChain manually
244
- chain = ConversationalRetrievalChain(
245
- retriever=vectorstore.as_retriever(search_kwargs={'k': 6}),
246
- question_generator=question_generator,
247
- combine_docs_chain=qa_chain,
248
- return_source_documents=True,
249
- verbose=False
250
- )
 
 
251
 
252
- # 💬 Gradio UI
253
- chat_history = []
254
 
255
  with gr.Blocks() as demo:
 
 
256
  chatbot = gr.Chatbot(
257
- [("", "Hello, I'm Thierry Decae's chatbot, you can ask me any recruitment related questions such as my experience, where I'm eligible to work, skills etc. You can chat with me directly in multiple languages.")],
258
- avatar_images=["./multiple_docs/Guest.jpg", "./multiple_docs/Thierry Picture.jpg"]
 
 
259
  )
260
- msg = gr.Textbox(placeholder="Type your question here...")
261
- clear = gr.Button("Clear")
262
 
263
- def user(query, chat_history):
264
- chat_history_tuples = [(m[0], m[1]) for m in chat_history]
265
- result = chain({"question": query, "chat_history": chat_history_tuples})
266
- chat_history.append((query, result["answer"]))
267
- return gr.update(value=""), chat_history
268
 
269
- msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
270
- clear.click(lambda: None, None, chatbot, queue=False)
271
 
272
- demo.launch(debug=True) # remove share=True if running in HF Spaces
 
 
 
 
273
 
 
 
 
 
 
274
 
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import sys
3
  import requests
4
+
5
# SQLite workaround for Chroma on Hugging Face Spaces: when the pysqlite3
# package is installed, register it under the stdlib name "sqlite3" so that
# every later `import sqlite3` (including Chroma's) picks it up.
# NOTE(review): presumably needed because the base image ships an SQLite that
# is too old for Chroma - confirm against the Chroma troubleshooting docs.
try:
    __import__("pysqlite3")
except ImportError:
    # pysqlite3 is optional (e.g. local development); keep the interpreter's
    # built-in sqlite3, but say so instead of failing silently.
    print(
        "pysqlite3 is not available; using the built-in sqlite3 module.",
        flush=True,
    )
else:
    # Only swap modules once the import is known to have succeeded.
    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
11
+
12
  import gradio as gr
13
 
14
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
15
+ from langchain_text_splitters import CharacterTextSplitter
16
+ from langchain_chroma import Chroma
17
+ from langchain_huggingface import HuggingFaceEmbeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
 
 
 
 
19
 
20
+ DOCS_DIR = "multiple_docs"
21
+ DB_DIR = "./db"
22
+ COLLECTION_NAME = "thierry_recruitment_docs"
23
+
24
+ DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
25
  DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
26
 
27
+ WELCOME_MESSAGE = (
28
+ "Hello, I'm Thierry Decae's chatbot. You can ask me recruitment-related "
29
+ "questions about my experience, skills, availability, work eligibility, "
30
+ "projects, and background. You can chat with me in multiple languages."
31
+ )
32
+
33
+
34
def call_deepseek(messages, temperature=0.4, max_tokens=700):
    """Send a chat-completion request to the DeepSeek API and return the reply.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts forwarded
            unchanged as the request's ``messages`` field.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion length limit forwarded to the API.

    Returns:
        The stripped text of the first choice, or a fixed explanatory message
        when ``DEEPSEEK_API_KEY`` is not configured (no request is made then).

    Raises:
        requests.HTTPError: On a non-success HTTP status (``raise_for_status``).
        KeyError, IndexError: If the response JSON lacks the expected
            ``choices[0].message.content`` path.
    """
    if not DEEPSEEK_API_KEY:
        return "DEEPSEEK_API_KEY is not set in the Hugging Face Space secrets."

    reply = requests.post(
        DEEPSEEK_API_URL,
        headers={
            "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": "deepseek-chat",
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        },
        # Bound the wait so a stalled API call cannot hang the UI forever.
        timeout=60,
    )
    reply.raise_for_status()

    first_choice = reply.json()["choices"][0]
    return first_choice["message"]["content"].strip()
59
+
60
+
61
def load_documents():
    """Load every supported file in ``DOCS_DIR`` and split it into chunks.

    Supported extensions (case-insensitive): ``.pdf``, ``.docx``/``.doc`` and
    ``.txt``. Any other file (for example the avatar images) is ignored. A
    file that fails to load is reported on stdout and skipped so one bad
    document does not prevent start-up.

    Returns:
        The chunks produced by ``CharacterTextSplitter`` (chunk size 1000,
        overlap 100) over all loaded documents.

    Raises:
        FileNotFoundError: If ``DOCS_DIR`` does not exist.
        ValueError: If no document could be loaded.
    """
    if not os.path.exists(DOCS_DIR):
        raise FileNotFoundError(f"Folder not found: {DOCS_DIR}")

    docs = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # chunk order reproducible from one start-up to the next.
    for filename in sorted(os.listdir(DOCS_DIR)):
        path = os.path.join(DOCS_DIR, filename)
        lower = filename.lower()

        try:
            if lower.endswith(".pdf"):
                loader = PyPDFLoader(path)
            elif lower.endswith((".docx", ".doc")):
                # NOTE(review): legacy .doc files are routed to the docx
                # loader; if it cannot parse them the error is logged below.
                loader = Docx2txtLoader(path)
            elif lower.endswith(".txt"):
                loader = TextLoader(path, encoding="utf-8")
            else:
                continue  # unsupported file type
            docs.extend(loader.load())
        except Exception as e:
            # Best effort: keep going with the remaining files.
            print(f"Could not load {filename}: {e}", flush=True)

    if not docs:
        raise ValueError(f"No documents found in {DOCS_DIR}")

    splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )

    return splitter.split_documents(docs)
93
+
94
+
95
def _stable_chunk_ids(docs):
    """Return one deterministic, batch-unique id per chunk in ``docs``."""
    import hashlib
    from collections import Counter

    occurrences = Counter()
    ids = []
    for doc in docs:
        meta = doc.metadata or {}
        raw = f"{meta.get('source', '')}|{meta.get('page', '')}|{doc.page_content}"
        digest = hashlib.sha256(raw.encode("utf-8", errors="replace")).hexdigest()
        # Identical chunks from the same source get an occurrence suffix so
        # ids stay unique within a single insert batch.
        ids.append(f"{digest}-{occurrences[digest]}")
        occurrences[digest] += 1
    return ids


def build_vectorstore():
    """Embed the document chunks and store them in the persistent Chroma DB.

    Loads the MiniLM sentence-transformer embedding model, loads and splits
    the documents via ``load_documents()``, and writes them to the collection
    ``COLLECTION_NAME`` under ``DB_DIR``. Progress is printed to stdout.

    Returns:
        The populated ``Chroma`` vector store.

    Raises:
        FileNotFoundError, ValueError: Propagated from ``load_documents()``.
    """
    print("Loading embedding model...", flush=True)

    embedding_function = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    print("Loading documents...", flush=True)
    docs = load_documents()

    print(f"Loaded {len(docs)} document chunks.", flush=True)

    # The collection lives in a persistent directory, so inserting with fresh
    # random ids on every start would pile up duplicate chunks and crowd the
    # top-k results. Content-derived ids make a re-run overwrite the same rows.
    # NOTE(review): chunks of documents that were removed or edited since the
    # last run are not purged by this - confirm whether that matters here.
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embedding_function,
        ids=_stable_chunk_ids(docs),
        persist_directory=DB_DIR,
        collection_name=COLLECTION_NAME,
    )

    return vectorstore
115
+
116
+
117
+ vectorstore = build_vectorstore()
118
+ retriever = vectorstore.as_retriever(search_kwargs={"k": 6})
119
+
120
+
121
def format_chat_history(history):
    """Render the most recent chat messages as plain ``role: content`` lines.

    Args:
        history: Sequence of message dicts with ``"role"`` and ``"content"``
            keys, or a falsy value for "no history".

    Returns:
        The last eight messages joined by newlines, skipping any message
        whose role or content is missing or empty; ``""`` when there is no
        history.
    """
    if not history:
        return ""

    # Only the tail of the conversation is kept to bound the prompt size.
    turns = ((m.get("role", ""), m.get("content", "")) for m in history[-8:])
    return "\n".join(f"{who}: {what}" for who, what in turns if who and what)
133
+
134
+
135
def answer_question(query, chat_history):
    """Answer one user turn using retrieved document context and DeepSeek.

    Retrieves the chunks most relevant to ``query``, sends them together with
    the recent conversation to ``call_deepseek``, and appends the user and
    assistant messages to ``chat_history``. Any failure while retrieving or
    calling the model is logged to stdout and replaced by a generic apology,
    so the UI always receives a reply.

    Args:
        query: Raw text from the input box. Blank or ``None`` input is
            ignored and leaves the history untouched.
        chat_history: List of ``{"role": ..., "content": ...}`` dicts.
            ``None`` is treated as an empty history. A list passed in is
            extended in place.

    Returns:
        A ``("", chat_history)`` tuple: the empty string clears the textbox
        and the list is the updated transcript.
    """
    # The appends at the end run outside the try block, so a missing history
    # must be normalised here or it would surface as an unhandled error.
    if chat_history is None:
        chat_history = []

    if not query or not query.strip():
        return "", chat_history

    query = query.strip()

    try:
        retrieved_docs = retriever.invoke(query)

        context = "\n\n".join(
            doc.page_content for doc in retrieved_docs if doc.page_content
        )

        history_text = format_chat_history(chat_history)

        # Prompt text is kept flush-left on purpose so that no source-code
        # indentation leaks into what the model receives.
        system_prompt = """
You are Thierry Decae's recruitment chatbot.

Your role is to answer questions about Thierry Decae's career, experience,
skills, projects, availability, work eligibility, and professional background.

Use only the provided context when answering factual questions.
If the answer is not available in the context, say: "I'm not sure about that."

Always answer as Thierry, using "I", "my", and "me".
Do not refer to Thierry as "he".
Be professional, concise, and helpful.
You may answer in the same language as the user.
"""

        user_prompt = f"""
Conversation history:
{history_text}

Context from Thierry's documents:
{context}

User question:
{query}

Answer:
"""

        answer = call_deepseek(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]
        )

    except Exception as e:
        # UI boundary: log the cause, show the visitor a neutral message.
        print(f"Error while answering: {e}", flush=True)
        answer = (
            "Sorry, I ran into an error while answering. "
            "Please try again in a moment."
        )

    chat_history.append({"role": "user", "content": query})
    chat_history.append({"role": "assistant", "content": answer})

    return "", chat_history
 
 
 
 
 
196
 
 
 
 
 
197
 
198
def clear_chat():
    """Return a fresh transcript that contains only the welcome message."""
    greeting = dict(role="assistant", content=WELCOME_MESSAGE)
    return [greeting]
200
+
201
+
202
# Custom avatars are used only when both image files exist; otherwise None is
# passed and the Chatbot component is left with its defaults.
guest_img = os.path.join(DOCS_DIR, "Guest.jpg")
thierry_img = os.path.join(DOCS_DIR, "Thierry Picture.jpg")
avatar_images = (
    [guest_img, thierry_img]
    if os.path.exists(guest_img) and os.path.exists(thierry_img)
    else None
)


# Page layout: title, chat transcript, question box, and a reset button.
with gr.Blocks() as demo:
    gr.Markdown("# Thierry Decae Recruitment Chatbot")

    chatbot = gr.Chatbot(
        value=[{"role": "assistant", "content": WELCOME_MESSAGE}],
        type="messages",
        avatar_images=avatar_images,
        height=500,
    )

    msg = gr.Textbox(
        placeholder="Ask a recruitment-related question...",
        label="Your question",
    )

    clear = gr.Button("Clear chat")

    # Pressing Enter sends the question; the handler returns the cleared
    # textbox value and the updated transcript.
    msg.submit(fn=answer_question, inputs=[msg, chatbot], outputs=[msg, chatbot])

    # Reset the transcript to just the welcome message.
    clear.click(fn=clear_chat, inputs=None, outputs=chatbot)


# Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
listen_port = int(os.getenv("PORT", 7860))
demo.launch(server_name="0.0.0.0", server_port=listen_port)