Ephraimmm committed on
Commit 06e6988 · verified · 1 Parent(s): 490d2bd

Upload 16 files

app.py ADDED
@@ -0,0 +1,177 @@
+ import os
+ import traceback
+ import numpy as np
+ import gradio as gr
+
+ from openai import AsyncOpenAI
+ from langsmith import traceable
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from src.prompts import system_prompt
+ # from src.name_extractor import extract_name_gliner
+ from src.models import CacheEntry
+ from src.config import Config
+ from src.utils import FileReader
+
+ # ---------------------------------------------------------------------
+ # CHAT CLASS
+ # ---------------------------------------------------------------------
+ class MyProfileAvatarChat(Config, FileReader):
+     def __init__(self, max_history_turns: int = 10, similarity_thresh: float = 0.80):
+         Config.__init__(self)
+         FileReader.__init__(self)
+
+         # 1. Try to load the profile name from the environment
+         self.name = os.getenv("PROFIL_NAME")
+         # if not self.name:
+         #     name = extract_name_gliner(self.linkedin_profile)
+         #     self.name = name["person"][0]
+         #     print(f"Name found on Linkedin profile: {self.name}")
+
+         self.openai = AsyncOpenAI(api_key=self.openai_api_key)
+
+         # Build the system prompt once
+         self.system_prompt = system_prompt
+         self.system_prompt += f"## Linkedin Profile:\n{self.linkedin_profile}\n\n"
+         self.system_prompt += f"## Additional Information:\n{self.additional_info}\n\n"
+         self.system_prompt += f"With this context, please chat with the user, always staying in character as {self.name}."
+
+         # Settings
+         self.max_history_turns = max_history_turns
+         self.similarity_threshold = similarity_thresh
+
+         # QA cache: list of dicts {"question": str, "answer": str, "embedding": np.ndarray | None}
+         self.qa_cache = []
+
+     def format_history(self, history):
+         return "\n".join(f"{turn['role'].upper()}: {turn['content']}" for turn in history)
+
+     async def embed(self, text: str):
+         """Return the embedding vector for text (uses OpenAI embeddings)."""
+         resp = await self.openai.embeddings.create(
+             model="text-embedding-3-small",
+             input=text
+         )
+         return np.array(resp.data[0].embedding)
+
+     def cosine_sim(self, a: np.ndarray, b: np.ndarray) -> float:
+         return float(cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0])
+
+     async def find_similar_question(self, new_question: str):
+         if not self.qa_cache:
+             return None, 0.0
+         new_emb = await self.embed(new_question)
+         best = None
+         best_sim = 0.0
+         for item in self.qa_cache:
+             if item["embedding"] is None:  # skip entries whose embedding call failed
+                 continue
+             sim = self.cosine_sim(new_emb, item["embedding"])
+             if sim > best_sim:
+                 best_sim = sim
+                 best = item
+         if best and best_sim >= self.similarity_threshold:
+             return best, best_sim
+         return None, best_sim
+
+     async def chat(self, message: str, history: list, **kwargs):
+         """Main chat. Uses the semantic QA cache and a sliding window to limit tokens.
+
+         Args:
+             message: user message string
+             history: existing list of dicts [{"role": ..., "content": ...}]
+         Returns:
+             reply string
+         """
+         # Exact-match short-circuit against the cache
+         for qa in self.qa_cache:
+             if qa["question"] == message:
+                 print("Using exact cached reply")
+                 history.append({"role": "user", "content": message})
+                 history.append({"role": "assistant", "content": qa["answer"]})
+                 return qa["answer"]
+
+         # Check for a semantically similar previous question
+         similar, sim_score = await self.find_similar_question(message)
+         if similar:
+             print(f"Reusing past answer (similarity={sim_score:.2%})")
+             refine_prompt = (
+                 f"The user previously asked a similar question:\n"
+                 f"Old question: {similar['question']}\n"
+                 f"Old answer: {similar['answer']}\n\n"
+                 f"Now the user asks: {message}\n\n"
+                 f"Please update or refine the old answer to match the new question."
+             )
+             messages = [{"role": "system", "content": self.system_prompt},
+                         {"role": "user", "content": refine_prompt}]
+             try:
+                 response = await self.openai.chat.completions.create(
+                     model="gpt-4o-mini",
+                     messages=messages
+                 )
+                 reply = response.choices[0].message.content
+             except Exception as e:
+                 print(f"Error calling OpenAI for refinement: {e}")
+                 reply = similar["answer"]  # fall back to the cached answer
+         else:
+             # Build a token-efficient context (sliding window)
+             temp_history = history + [{"role": "user", "content": message}]
+             context_for_api = temp_history[-self.max_history_turns:]
+             messages = [{"role": "system", "content": self.system_prompt}] + context_for_api
+
+             try:
+                 response = await self.openai.chat.completions.create(
+                     model="gpt-4o-mini",
+                     messages=messages
+                 )
+                 reply = response.choices[0].message.content
+             except Exception as e:
+                 print(f"Error calling OpenAI: {e}")
+                 reply = "Sorry, I could not generate a reply right now."  # keep `reply` defined on failure
+
+         try:
+             emb = await self.embed(message)
+         except Exception as e:
+             print(f"Embedding Error: {e}")
+             traceback.print_exc()
+             emb = None
+
+         self.qa_cache.append({
+             "question": message,
+             "answer": reply,
+             "embedding": emb
+         })
+
+         return reply
+
+     @traceable(run_type="chain", name="ProfileChat")
+     async def chat_traced(self, *args, **kwargs):
+         """Wrapper for LangSmith tracing. Accepts any extra arguments
+         (like from Gradio) and passes only message/history to chat()."""
+         if len(args) >= 2:
+             message, history = args[0], args[1]
+         else:
+             message = kwargs.get("message")
+             history = kwargs.get("history")
+         return await self.chat(message, history)
+
+ if __name__ == "__main__":
+     my_profile = MyProfileAvatarChat()
+     with gr.Blocks() as demo:
+         # Per-user chat history state
+         state = gr.State([])
+
+         # Chat interface
+         chat = gr.ChatInterface(
+             my_profile.chat_traced
+         )
+
+     demo.queue(max_size=10).launch(
+         server_name="0.0.0.0",
+         server_port=8000,
+         show_error=True,
+         share=True
+     )
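For orientation, a minimal local-usage sketch of the class above (hypothetical snippet, not part of the commit; it assumes the `./me` files exist and `OPENAI_API_KEY`/`PROFIL_NAME` are set in the environment):

import asyncio
from app import MyProfileAvatarChat

async def main():
    bot = MyProfileAvatarChat()
    history = []
    # First question: the cache is empty, so this calls gpt-4o-mini and seeds qa_cache
    print(await bot.chat("What do you currently do for work?", history))
    # Paraphrased repeat: find_similar_question() should match it
    # (cosine similarity >= 0.80) and only run the cheaper refine step
    print(await bot.chat("What is your current job?", history))

asyncio.run(main())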
me/Linkedin_Profile.pdf ADDED
Binary file (68 kB).
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
+
+ gradio>=5.22.0
+ langsmith>=0.3.18
+ openai>=1.68.2
+ pypdf>=5.4.0
+ python-dotenv>=1.0.1
+ requests>=2.32.3
+ setuptools>=78.1.0
+ scikit-learn>=1.7.2
+ # gliner2==1.0.2
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (151 Bytes).
 
src/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.05 kB).
 
src/__pycache__/file_loader.cpython-312.pyc ADDED
Binary file (1.12 kB).
 
src/__pycache__/models.cpython-312.pyc ADDED
Binary file (492 Bytes).
 
src/__pycache__/prompts.cpython-312.pyc ADDED
Binary file (1.09 kB).
 
src/__pycache__/utils.cpython-312.pyc ADDED
Binary file (711 Bytes).
 
src/config.py ADDED
@@ -0,0 +1,17 @@
+ import os
+ from dotenv import load_dotenv
+
+ from langsmith import Client, traceable
+
+ # ---------------------------------------------------------------------
+ # CONFIG
+ # ---------------------------------------------------------------------
+ class Config:
+     def __init__(self):
+         load_dotenv(override=True)
+         self.openai_api_key = os.getenv("OPENAI_API_KEY")
+         self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
+         self.langsmith_endpoint = os.getenv("LANGSMITH_ENDPOINT")
+
+         # Initialize LangSmith
+         self.langsmith_client = Client(api_key=self.langsmith_api_key)
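Config relies on python-dotenv, so the expected setup is a .env file next to app.py. A sample with placeholder values (the variable names come from config.py, app.py, and prompts.py; the endpoint shown is LangSmith's usual default):

OPENAI_API_KEY=sk-...
LANGSMITH_API_KEY=lsv2_...
LANGSMITH_ENDPOINT=https://api.smith.langchain.com
PROFIL_NAME=Jane Doe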
src/file_loader.py ADDED
@@ -0,0 +1,23 @@
+ from functools import lru_cache
+ from pypdf import PdfReader
+
+ @lru_cache()
+ def load_pdf_text(path: str) -> str:
+     text = ""
+     try:
+         reader = PdfReader(path)
+         for page in reader.pages:
+             page_text = page.extract_text()
+             if page_text:
+                 text += page_text
+     except Exception:
+         return ""
+     return text
+
+ @lru_cache()
+ def load_text_file(path: str) -> str:
+     try:
+         with open(path, "r", encoding="utf-8") as f:
+             return f.read()
+     except Exception:
+         return ""
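Both loaders are wrapped in functools.lru_cache, so each path is parsed at most once per process. A small sketch of the effect (hypothetical snippet):

from src.file_loader import load_pdf_text

first = load_pdf_text("./me/Linkedin_Profile.pdf")   # parses the PDF
second = load_pdf_text("./me/Linkedin_Profile.pdf")  # served from the cache
assert first is second  # same cached string object, no re-parse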
src/models.py ADDED
@@ -0,0 +1,6 @@
+ from pydantic import BaseModel
+
+ class CacheEntry(BaseModel):
+     question: str
+     answer: str
+     embedding: list[float]
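CacheEntry is imported in app.py but the runtime cache currently stores plain dicts; a sketch of how an entry could be validated through the model instead (hypothetical usage; the embedding would need converting from np.ndarray to list[float]):

import numpy as np
from src.models import CacheEntry

emb = np.zeros(1536)  # placeholder vector; text-embedding-3-small returns 1536 dimensions
entry = CacheEntry(
    question="What is your current job?",
    answer="I work as a data scientist.",
    embedding=emb.tolist(),
)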
src/name_extractor.py ADDED
@@ -0,0 +1,6 @@
+ from gliner2 import GLiNER2
+
+ def extract_name_gliner(text: str) -> dict:
+     extractor = GLiNER2.from_pretrained("fastino/gliner2-base-v1")
+     result = extractor.extract_entities(text[:700], ["person"])
+     return result["entities"]
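This is the fallback that app.py's __init__ keeps commented out; a sketch of that call path (hypothetical snippet; it requires the gliner2 package, currently commented out in requirements.txt):

from src.file_loader import load_pdf_text
from src.name_extractor import extract_name_gliner

profile_text = load_pdf_text("./me/Linkedin_Profile.pdf")
entities = extract_name_gliner(profile_text)  # only the first 700 chars are scanned
name = entities["person"][0]  # same access pattern as the commented-out code in app.py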
src/prompts.py ADDED
@@ -0,0 +1,19 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv(override=True)
+
+ name = os.getenv("PROFIL_NAME")
+
+ system_prompt = f"You are acting as {name}. You are answering questions about {name}'s work experience and life, \
+ particularly questions related to {name}'s career, background, skills and experience. \
+ Your responsibility is to represent {name} for interactions on the website as faithfully as possible. \
+ Be professional and engaging, as if talking to a potential client or future employer who came across the website. \
+ Do not answer any questions which are not related to {name}'s portfolio. \
+ If you do not know the answer, say so and ask for contact details so that questions the agent cannot answer can be followed up. \
+ If you need to check something, e.g. a salary expectation question, use tools to see what the range for such a position is. \
+ "
+ # When asked about professional experience, focus primarily on your data scientist experience. You may briefly mention past roles (e.g., Tesco, education) and acknowledge that your career path hasn't been linear, but emphasize that this variety has given you a broader perspective and valuable transferable skills. \
+ # Whenever appropriate, invite the person to contact you via email if they have further questions or would like to arrange a conversation.
+ # If you don't know the answer, state that clearly and honestly. \
+ # Don't claim experience with technologies I have not used as a Data Scientist, e.g. the R language - you never had experience with it.
src/utils.py ADDED
@@ -0,0 +1,13 @@
+ from src.file_loader import load_pdf_text, load_text_file
+
+ # ---------------------------------------------------------------------
+ # FILE READER
+ # ---------------------------------------------------------------------
+ class FileReader:
+     def __init__(self):
+         self.linkedin_profile = load_pdf_text("./me/Linkedin_Profile.pdf")
+         self.additional_info = load_text_file("./me/additional_info.txt")
+
+         # print("=== LINKEDIN PROFILE CONTENT ===")
+         # print(self.linkedin_profile)
+         # print("=== END ===")