Upload 16 files
- app.py +177 -0
- me/Linkedin_Profile.pdf +0 -0
- requirements.txt +10 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-312.pyc +0 -0
- src/__pycache__/config.cpython-312.pyc +0 -0
- src/__pycache__/file_loader.cpython-312.pyc +0 -0
- src/__pycache__/models.cpython-312.pyc +0 -0
- src/__pycache__/prompts.cpython-312.pyc +0 -0
- src/__pycache__/utils.cpython-312.pyc +0 -0
- src/config.py +17 -0
- src/file_loader.py +23 -0
- src/models.py +6 -0
- src/name_extractor.py +6 -0
- src/prompts.py +19 -0
- src/utils.py +13 -0
app.py
ADDED
@@ -0,0 +1,177 @@
import os
import traceback
import numpy as np
import gradio as gr

from openai import AsyncOpenAI
from langsmith import traceable
from sklearn.metrics.pairwise import cosine_similarity

from src.prompts import system_prompt
# from src.name_extractor import extract_name_gliner
from src.models import CacheEntry
from src.config import Config
from src.utils import FileReader

# ---------------------------------------------------------------------
# CHAT CLASS
# ---------------------------------------------------------------------
class MyProfileAvatarChat(Config, FileReader):
    def __init__(self, max_history_turns: int = 10, similarity_thresh: float = 0.80):
        Config.__init__(self)
        FileReader.__init__(self)

        # 1. Try to load the name from the environment
        self.name = os.getenv("PROFIL_NAME")
        # if not self.name:
        #     name = extract_name_gliner(self.linkedin_profile)
        #     self.name = name["person"][0]
        #     print(f"Name found on Linkedin profile: {self.name}")

        self.openai = AsyncOpenAI(api_key=self.openai_api_key)

        # Build system prompt once
        self.system_prompt = system_prompt
        self.system_prompt += f"## Linkedin Profile:\n{self.linkedin_profile}\n\n"
        self.system_prompt += f"## Additional Information:\n{self.additional_info}\n\n"
        self.system_prompt += f"With this context, please chat with the user, always staying in character as {self.name}."

        # Settings
        self.max_history_turns = max_history_turns
        self.similarity_threshold = similarity_thresh

        # QA cache (question -> answer -> embedding)
        self.qa_cache = []  # list of dicts: {"question": str, "answer": str, "embedding": np.ndarray}

    def format_history(self, history):
        return "\n".join(f"{turn['role'].upper()}: {turn['content']}" for turn in history)

    async def embed(self, text: str):
        """Return the embedding vector for text (uses OpenAI embeddings)."""
        resp = await self.openai.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return np.array(resp.data[0].embedding)

    def cosine_sim(self, a: np.ndarray, b: np.ndarray) -> float:
        return float(cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0])

    async def find_similar_question(self, new_question: str):
        if not self.qa_cache:
            return None, 0.0
        new_emb = await self.embed(new_question)
        best = None
        best_sim = 0.0
        for item in self.qa_cache:
            if item["embedding"] is None:  # skip entries whose embedding failed
                continue
            sim = self.cosine_sim(new_emb, item["embedding"])
            if sim > best_sim:
                best_sim = sim
                best = item
        if best and best_sim >= self.similarity_threshold:
            return best, best_sim
        return None, best_sim

    async def chat(self, message: str, history: list, **kwargs):
        """Main chat. Uses a semantic QA cache and a sliding window to limit tokens.

        Args:
            message: user message string
            history: existing list of dicts [{"role": ..., "content": ...}]
        Returns:
            reply string
        """
        # Cache exact-match short-circuit
        for qa in self.qa_cache:
            if qa["question"] == message:
                print("Using exact cached reply")
                history.append({"role": "user", "content": message})
                history.append({"role": "assistant", "content": qa["answer"]})
                return qa["answer"]

        # Check for a semantically similar previous question
        similar, sim_score = await self.find_similar_question(message)
        if similar:
            print(f"Reusing past answer (similarity={sim_score:.2%})")
            refine_prompt = (
                f"The user previously asked a similar question:\n"
                f"Old question: {similar['question']}\n"
                f"Old answer: {similar['answer']}\n\n"
                f"Now the user asks: {message}\n\n"
                f"Please update or refine the old answer to match the new question."
            )
            messages = [{"role": "system", "content": self.system_prompt},
                        {"role": "user", "content": refine_prompt}]
            try:
                response = await self.openai.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages
                )
                reply = response.choices[0].message.content
            except Exception as e:
                print(f"Error calling OpenAI for refinement: {e}")
                reply = similar["answer"]
        else:
            # Build token-efficient context (sliding window)
            temp_history = history + [{"role": "user", "content": message}]
            context_for_api = temp_history[-self.max_history_turns:]
            messages = [{"role": "system", "content": self.system_prompt}] + context_for_api

            try:
                response = await self.openai.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages
                )
                reply = response.choices[0].message.content
            except Exception as e:
                print(f"Error calling OpenAI: {e}")
                reply = "Sorry, something went wrong while generating a reply. Please try again."

        try:
            emb = await self.embed(message)
        except Exception as e:
            print(f"Embedding Error: {e}")
            traceback.print_exc()
            emb = None

        self.qa_cache.append({
            "question": message,
            "answer": reply,
            "embedding": emb
        })

        return reply

    @traceable(run_type="chain", name="ProfileChat")
    async def chat_traced(self, *args, **kwargs):
        """Wrapper for LangSmith tracing. Accepts any extra arguments
        (like from Gradio) and passes only message/history to chat()."""
        if len(args) >= 2:
            message, history = args[0], args[1]
        else:
            message = kwargs.get("message")
            history = kwargs.get("history")
        return await self.chat(message, history)


if __name__ == "__main__":
    my_profile = MyProfileAvatarChat()
    with gr.Blocks() as demo:
        # Per-user chat history state
        state = gr.State([])

        # Chat interface
        chat = gr.ChatInterface(
            my_profile.chat_traced
        )

    demo.queue(max_size=10).launch(
        server_name="0.0.0.0",
        show_error=True,
        share=True
    )
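For a quick local sanity check of the caching logic, here is a minimal sketch (a hypothetical test script, assuming OPENAI_API_KEY and PROFIL_NAME are set and ./me contains the profile files):

import asyncio

from app import MyProfileAvatarChat

async def main():
    bot = MyProfileAvatarChat()
    history = []
    print(await bot.chat("What do you do for a living?", history))
    # A close paraphrase: find_similar_question() should score above the 0.80
    # default threshold, so the old answer is refined rather than recomputed.
    print(await bot.chat("What's your current job?", history))
    # An exact repeat is served straight from the cache, with no API call.
    print(await bot.chat("What do you do for a living?", history))

asyncio.run(main())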
me/Linkedin_Profile.pdf
ADDED
Binary file (68 kB).
requirements.txt
ADDED
@@ -0,0 +1,10 @@
gradio>=5.22.0
langsmith>=0.3.18
openai>=1.68.2
pypdf>=5.4.0
python-dotenv>=1.0.1
requests>=2.32.3
setuptools>=78.1.0
scikit-learn>=1.7.2
# gliner2==1.0.2
src/__init__.py
ADDED
File without changes
src/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (151 Bytes).

src/__pycache__/config.cpython-312.pyc
ADDED
Binary file (1.05 kB).

src/__pycache__/file_loader.cpython-312.pyc
ADDED
Binary file (1.12 kB).

src/__pycache__/models.cpython-312.pyc
ADDED
Binary file (492 Bytes).

src/__pycache__/prompts.cpython-312.pyc
ADDED
Binary file (1.09 kB).

src/__pycache__/utils.cpython-312.pyc
ADDED
Binary file (711 Bytes).
src/config.py
ADDED
@@ -0,0 +1,17 @@
import os
from dotenv import load_dotenv

from langsmith import Client, traceable

# ---------------------------------------------------------------------
# CONFIG
# ---------------------------------------------------------------------
class Config:
    def __init__(self):
        load_dotenv(override=True)
        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
        self.langsmith_endpoint = os.getenv("LANGSMITH_ENDPOINT")

        # Initialize LangSmith
        self.langsmith_client = Client(api_key=self.langsmith_api_key)
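Config pulls everything from the environment; a sketch of the .env entries it (together with prompts.py) expects, with placeholder values:

# Expected .env entries (placeholder values), read via load_dotenv():
#   OPENAI_API_KEY=sk-...
#   LANGSMITH_API_KEY=lsv2_...
#   LANGSMITH_ENDPOINT=https://api.smith.langchain.com
#   PROFIL_NAME=Jane Doe
from src.config import Config

cfg = Config()
assert cfg.openai_api_key is not None, "OPENAI_API_KEY is required by AsyncOpenAI"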
src/file_loader.py
ADDED
@@ -0,0 +1,23 @@
from functools import lru_cache
from pypdf import PdfReader

@lru_cache()
def load_pdf_text(path: str) -> str:
    text = ""
    try:
        reader = PdfReader(path)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
    except Exception:
        return ""
    return text

@lru_cache()
def load_text_file(path: str) -> str:
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception:
        return ""
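Since both loaders are wrapped in lru_cache, repeated FileReader instantiations re-parse nothing; a short sketch of that behavior:

from src.file_loader import load_pdf_text

text1 = load_pdf_text("./me/Linkedin_Profile.pdf")  # parses the PDF
text2 = load_pdf_text("./me/Linkedin_Profile.pdf")  # served from the cache
print(load_pdf_text.cache_info())                   # hits=1, misses=1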
src/models.py
ADDED
@@ -0,0 +1,6 @@
from pydantic import BaseModel

class CacheEntry(BaseModel):
    question: str
    answer: str
    embedding: list[float]
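app.py imports CacheEntry but currently stores plain dicts in qa_cache; a sketch of how the model could validate an entry instead (note the numpy-to-list conversion, since pydantic expects list[float]):

import numpy as np
from src.models import CacheEntry

emb = np.zeros(1536)  # text-embedding-3-small returns 1536-dimensional vectors
entry = CacheEntry(
    question="What do you do?",
    answer="I'm a data scientist.",
    embedding=emb.tolist(),
)
print(entry.question)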
src/name_extractor.py
ADDED
@@ -0,0 +1,6 @@
from gliner2 import GLiNER2

def extract_name_gliner(text: str) -> dict:
    extractor = GLiNER2.from_pretrained("fastino/gliner2-base-v1")
    result = extractor.extract_entities(text[:700], ["person"])
    return result["entities"]
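This helper is currently disabled (gliner2 is commented out in both requirements.txt and app.py); the commented code in app.py suggests usage along these lines:

from src.name_extractor import extract_name_gliner
from src.utils import FileReader

reader = FileReader()
entities = extract_name_gliner(reader.linkedin_profile)
name = entities["person"][0]  # first "person" entity found in the profile
print(f"Name found on Linkedin profile: {name}")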
src/prompts.py
ADDED
@@ -0,0 +1,19 @@
import os
from dotenv import load_dotenv

load_dotenv(override=True)

name = os.getenv("PROFIL_NAME")

system_prompt = f"You are acting as {name}. You are answering questions about {name}'s work experience and life, \
particularly questions related to {name}'s career, background, skills and experience. \
Your responsibility is to represent {name} for interactions on the website as faithfully as possible. \
Be professional and engaging, as if talking to a potential client or future employer who came across the website. \
Do not answer any questions which are not related to {name}'s portfolio. \
If you do not know the answer, say so and ask for contact details so that questions the agent cannot answer can be followed up. \
If you need to check e.g. a salary expectation question, then use tools to see what the range for such a position is. \
"
# When asked about professional experience, focus primarily on your data scientist experience. You may briefly mention past roles (e.g., Tesco, education) and acknowledge that your career path hasn't been linear, but emphasize that this variety has given you a broader perspective and valuable transferable skills.
# Whenever appropriate, invite the person to contact you via email if they have further questions or would like to arrange a conversation.
# If you don't know the answer, state that clearly and honestly.
# Don't claim experience with technologies I haven't used as a Data Scientist, e.g. the R language - you never had experience with it.
src/utils.py
ADDED
@@ -0,0 +1,13 @@
from src.file_loader import load_pdf_text, load_text_file

# ---------------------------------------------------------------------
# FILE READER
# ---------------------------------------------------------------------
class FileReader:
    def __init__(self):
        self.linkedin_profile = load_pdf_text("./me/Linkedin_Profile.pdf")
        self.additional_info = load_text_file("./me/additional_info.txt")

        # print("=== LINKEDIN PROFILE CONTENT ===")
        # print(self.linkedin_profile)
        # print("=== END ===")
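Because both loaders return an empty string on failure, FileReader degrades gracefully when a file is absent; a small sketch (note that me/additional_info.txt is not among the 16 uploaded files, so this may print an empty string rather than raising):

from src.utils import FileReader

reader = FileReader()
print(repr(reader.linkedin_profile)[:80])   # PDF text, if extraction succeeded
print(repr(reader.additional_info)[:80])    # likely '' - the file is missing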