import os
import json
import requests
import numpy as np

from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from together import Together
from openai import OpenAI


# Shared module state, populated by launch_depression_assistant().
# A bare `global` statement at module scope is a no-op, so the names are
# initialized explicitly instead.
db = None
referenced_tables_db = None
embedder = None
index = None
llm_client = None


def load_json_to_db(file_path):
    with open(file_path) as f:
        db = json.load(f)
    return db


def make_embeddings(embedder, embedder_name, db):
    # embedder_name is unused here but kept so the signature mirrors the
    # save/load helpers below; batch_size=1 trades speed for a small
    # memory footprint on CPU.
    texts = [chunk['text'] for chunk in db]
    embeddings = embedder.encode(texts, convert_to_numpy=True, batch_size=1, show_progress_bar=True)
    return embeddings


def get_project_root():
    return os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))


def save_embeddings(embedder_name, embeddings):
    root = get_project_root()
    file_path = os.path.join(root, "data", "embeddings", f"{embedder_name.replace('/', '_')}.npy")
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    np.save(file_path, embeddings)
    print(f"Saved embeddings to: {file_path}")


def load_embeddings(embedder_name):
    root = get_project_root()
    file_path = os.path.join(root, "data", "embeddings", f"{embedder_name.replace('/', '_')}.npy")

    try:
        embeddings = np.load(file_path, allow_pickle=True)
        print(f"Loaded embeddings from: {file_path}")
    except FileNotFoundError:
        # The fallback relies on the module-level `embedder` and `db` set by
        # launch_depression_assistant(), so call that first.
        print(f"Embeddings not found. Recomputing for: {embedder_name}")
        embeddings = make_embeddings(embedder, embedder_name, db)
        save_embeddings(embedder_name, embeddings)

    return embeddings


def load_embedder_with_fallbacks(embedder_name):
    print(f"Loading embedder {embedder_name}")
    try:
        model = SentenceTransformer(
            embedder_name,
            trust_remote_code=True,
            tokenizer_kwargs={"padding_side": "left"},
            device='cpu',
        )
    except TypeError:
        # Older sentence-transformers versions do not accept
        # tokenizer_kwargs; retry without it.
        model = SentenceTransformer(embedder_name, trust_remote_code=True, device='cpu')
    return model


def build_cosine_index(embeddings):
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / norms
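

# Note on the index: with every row of the matrix normalized to unit length,
# cosine similarity against a unit-norm query reduces to a single
# matrix-vector product,
#     cos(a, b) = (a . b) / (||a|| * ||b||) = a_hat . b_hat,
# which is why vector_search() below only needs np.dot per query.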


def load_cosine_index(embedder_name):
    embeddings = load_embeddings(embedder_name)
    normalized_embeddings = build_cosine_index(embeddings)
    return normalized_embeddings
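

# Retrieval strategy: take the top-k chunks by cosine similarity, collect the
# table IDs those chunks reference, then append any referenced table chunks
# that were not themselves retrieved, so the LLM also sees tables a passage
# points to even when the table itself did not rank in the top k.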
def vector_search(query, embedder, db, index, referenced_table_db, k=6):
    def get_detailed_instruct(task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery: {query}'

    task = 'Given a search query, retrieve relevant passages that answer the query'
    query_embedding = embedder.encode([get_detailed_instruct(task, query)], convert_to_numpy=True)
    query_vec = query_embedding / np.linalg.norm(query_embedding)

    cosine_similarities = np.dot(index, query_vec.T).flatten()
    top_k_indices = np.argsort(-cosine_similarities)[:k]

    results = []
    referenced_tables = set()
    retrieved_tables = set()

    for i in top_k_indices:
        metadata = db[i]['metadata']
        results.append({
            "text": db[i]['text'],
            "section": metadata['section'],
            "chunk_id": metadata['chunk_id'],
            "similarity": float(cosine_similarities[i]),
        })
        if metadata.get('referee_id'):
            retrieved_tables.add(metadata['referee_id'])
        if metadata.get('referenced_tables'):
            referenced_tables.update(metadata['referenced_tables'])

    tables_to_add = referenced_tables - retrieved_tables

    for chunk in referenced_table_db:
        if chunk['metadata']['referee_id'] in tables_to_add:
            results.append({
                "text": chunk['text'],
                "section": chunk['metadata']['section'],
                "chunk_id": chunk['metadata']['chunk_id'],
            })
    return results


def load_together_llm_client():
    load_dotenv()
    return Together(api_key=os.getenv("TOGETHER_API_KEY"))


def load_nvidia_llm_client():
    load_dotenv()
    return OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key=os.getenv("NVIDIA_API_KEY"),
    )


def construct_prompt(query, search_results):
    with open("src/system_prompt.txt", "r") as f:
        system_prompt = f.read().strip()

    prompt = f"""
### System Prompt
{system_prompt}

### User Query
{query}

### Clinical Guidelines Context
"""
    for res in search_results:
        prompt += f"- Reference section: {res['section']}\n- Passage: {res['text']}\n"
    return prompt


def construct_prompt_with_memory(query, search_results, chat_history=None, history_limit=4):
    with open("src/system_prompt.txt", "r") as f:
        system_prompt = f.read().strip()

    prompt = f"### System Prompt\n{system_prompt}\n\n"

    if chat_history:
        prompt += "### Chat History\n"
        for m in chat_history[-history_limit:]:
            prompt += f"{m['role'].title()}: {m['content']}\n"
        prompt += "\n"

    prompt += f"### User Query\n{query}\n\n"
    prompt += "### Clinical Guidelines Context\n"
    for res in search_results:
        prompt += f"- Reference section: {res['section']}\n- Passage: {res['text']}\n"
    return prompt


def call_llm(llm_client, prompt, stream_flag=False, max_tokens=500, temperature=0.05, top_p=0.9, model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"):
    print(f"Calling LLM with model: {model_name}")
    try:
        if stream_flag:
            def stream_generator():
                response = llm_client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    stream=True,
                )
                for chunk in response:
                    if chunk.choices and chunk.choices[0].delta.content:
                        yield chunk.choices[0].delta.content
            return stream_generator()
        else:
            response = llm_client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                stream=False,
            )
            return response.choices[0].message.content
    except Exception as e:
        print("Error in call_llm:", str(e))
        import traceback
        traceback.print_exc()
        raise
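

# Usage sketch (hypothetical client/prompt names): when stream_flag=True the
# call_* helpers return a generator, which the caller drains token by token:
#
#     for token in call_llm(client, prompt, stream_flag=True):
#         print(token, end="", flush=True)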


def call_nvidia_llm(llm_client, prompt, stream_flag=False, max_tokens=4096, temperature=0.6, top_p=0.7, model_name="openai/gpt-oss-20b"):
    print(f"Calling NVIDIA LLM with model: {model_name}")
    try:
        if stream_flag:
            def stream_generator():
                completion = llm_client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=temperature,
                    top_p=top_p,
                    max_tokens=max_tokens,
                    stream=True,
                )
                for chunk in completion:
                    # Guard against empty choices as well as empty deltas.
                    if chunk.choices and chunk.choices[0].delta.content is not None:
                        yield chunk.choices[0].delta.content
            return stream_generator()
        else:
            completion = llm_client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
                stream=False,
            )
            return completion.choices[0].message.content
    except Exception as e:
        print("Error in call_nvidia_llm:", str(e))
        import traceback
        traceback.print_exc()
        raise


def call_ollama(prompt, model="mistral", stream_flag=False, max_tokens=500, temperature=0.05, top_p=0.9):
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": True,
        # Ollama takes sampling parameters under "options";
        # "num_predict" is its name for the max-token cap.
        "options": {
            "temperature": temperature,
            "top_p": top_p,
            "num_predict": max_tokens,
        },
    }

    def stream_generator():
        with requests.post(url, json=payload, stream=True) as response:
            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    data = json.loads(line.decode("utf-8"))
                    yield data["response"]
                except (json.JSONDecodeError, KeyError):
                    continue

    if stream_flag:
        return stream_generator()
    # Non-streaming callers get the concatenated response instead of a generator.
    return "".join(stream_generator())
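

# Client selection order: try the NVIDIA endpoint first, fall back to
# Together, and leave llm_client as None if both fail, in which case
# depression_assistant() raises instead of generating.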
def launch_depression_assistant(embedder_name, designated_client=None):
    global db, referenced_tables_db, embedder, index, llm_client

    db = load_json_to_db("data/processed/guideline_db.json")
    referenced_tables_db = load_json_to_db("data/processed/referenced_table_chunks.json")

    embedder = load_embedder_with_fallbacks(embedder_name)
    index = load_cosine_index(embedder_name)

    if designated_client is None:
        print("Attempting to load NVIDIA LLM client...")
        try:
            llm_client = load_nvidia_llm_client()
            print("Successfully loaded NVIDIA LLM client.")
        except Exception as e:
            print(f"Failed to load NVIDIA LLM client: {e}")
            print("Attempting to load Together LLM client as a fallback...")
            try:
                llm_client = load_together_llm_client()
                print("Successfully loaded Together LLM client.")
            except Exception as e:
                print(f"Failed to load Together LLM client: {e}")
                llm_client = None
    else:
        llm_client = designated_client
        print(f"Using designated client: {type(llm_client).__name__}")

    if llm_client is None:
        print("Warning: No LLM client could be loaded. The assistant will not be able to generate responses.")

    print("---------Depression Assistant is ready to use!--------------\n\n")


def depression_assistant(query, model_name=None, max_tokens=None, temperature=None, top_p=None, stream_flag=False, chat_history=None):
    results = vector_search(query, embedder, db, index, referenced_tables_db, k=3)
    prompt = construct_prompt_with_memory(query, results, chat_history=chat_history)

    kwargs = {}
    if model_name:
        kwargs['model_name'] = model_name
    if max_tokens is not None:
        kwargs['max_tokens'] = max_tokens
    if temperature is not None:
        kwargs['temperature'] = temperature
    if top_p is not None:
        kwargs['top_p'] = top_p

    # "Run Ollama Locally" is a string sentinel passed in by the UI.
    if llm_client == "Run Ollama Locally":
        if 'model_name' in kwargs:
            kwargs['model'] = kwargs.pop('model_name')
        return results, call_ollama(prompt, stream_flag=stream_flag, **kwargs)
    elif isinstance(llm_client, OpenAI):
        return results, call_nvidia_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
    elif isinstance(llm_client, Together):
        return results, call_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
    else:
        if llm_client is None:
            raise ValueError("LLM client not initialized. Please check API keys.")
        # Any other client type is assumed to be OpenAI-compatible.
        return results, call_nvidia_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)


def load_queries_and_answers(query_file, answers_file):
    with open(query_file, 'r') as f:
        queries = [line.strip() for line in f]
    with open(answers_file, 'r') as f:
        answers = [line.strip() for line in f]
    return queries, answers


def write_batched_results(embedder_name, result_path):
    launch_depression_assistant(embedder_name)
    queries, answers = load_queries_and_answers("data/raw/queries.txt", "data/raw/answers.txt")
    embedder_filename = embedder_name.replace('/', '_')

    with open(os.path.join(result_path, f"Retrieved_Results_by_{embedder_filename}.md"), "w") as f1, \
         open(os.path.join(result_path, f"Response_by_{embedder_filename}.md"), "w") as f2:

        for i, query in enumerate(queries):
            result, response = depression_assistant(query)

            f1.write(f"## Query {i+1}\n{query}\n\n## Answer\n{answers[i]}\n\n## Retrieved Results\n")
            for res in result:
                f1.write(f"\n\n#### {res['section']}\n\n{res['text']}\n")
            f1.write("\n\n---\n\n")

            f2.write(f"## Query {i+1}\n{query}\n\n## Answer\n{answers[i]}\n\n## Response\n{response}\n\n---\n\n")
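

if __name__ == "__main__":
    # Minimal smoke test, assuming the data files referenced above exist and
    # an API key for at least one provider is set in .env. The embedder name
    # below is an assumption; substitute the model actually used to build the
    # saved embeddings.
    launch_depression_assistant("intfloat/multilingual-e5-large-instruct")
    retrieved, answer = depression_assistant("What are first-line treatments for mild depression?")
    print(answer)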