# bipolar/src/Rag.py — RAG pipeline for the depression assistant.
# Provenance (from the original file-viewer header): commit 6dffeff,
# "use openai oss", author ymali.
import os
import json
import time
import requests
import numpy as np
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from together import Together
from openai import OpenAI
# NOTE(review): `global` at module scope is a no-op in Python; these names
# are actually bound at module level by launch_depression_assistant's own
# `global` statement. Kept for readability as a declaration of shared state.
global db, referenced_tables_db, embedder, index, llm_client
def load_json_to_db(file_path):
    """Load a JSON file and return its parsed content.

    Args:
        file_path: Path to a JSON file (expected to hold a list of chunk dicts).

    Returns:
        The deserialized JSON object.
    """
    # Explicit UTF-8 avoids platform-dependent default encodings (e.g. cp1252
    # on Windows) breaking on non-ASCII guideline text.
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)
# -------- Embedding Functions --------
def make_embeddings(embedder, embedder_name, db):
    """Encode every chunk's text in *db* and return the embedding matrix."""
    # Gather the raw chunk texts in database order so rows align with db.
    corpus = [entry['text'] for entry in db]
    # NOTE: batch_size=1 keeps peak memory low on CPU-only hosts.
    return embedder.encode(
        corpus,
        convert_to_numpy=True,
        batch_size=1,
        show_progress_bar=True,
    )
def get_project_root():
    """Return the absolute path of the repository root (parent of src/)."""
    here = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(here, ".."))
def save_embeddings(embedder_name, embeddings):
    """Persist *embeddings* under data/embeddings/, keyed by model name.

    Slashes in the model name are replaced with underscores so the name
    is a valid filename.
    """
    safe_name = embedder_name.replace('/', '_')
    target = os.path.join(get_project_root(), "data", "embeddings", f"{safe_name}.npy")
    # Create the destination directory on first use.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    np.save(target, embeddings)
    print(f"Saved embeddings to: {target}")
def load_embeddings(embedder_name):
    """Load cached embeddings for *embedder_name*; recompute on cache miss.

    NOTE(review): the recompute path reads the module-level globals
    ``embedder`` and ``db`` (set by launch_depression_assistant) — calling
    this before launch on a cache miss raises NameError. Confirm intended.
    """
    root = get_project_root()
    # Cache filename mirrors save_embeddings: slashes -> underscores.
    file_path = os.path.join(root, "data", "embeddings", f"{embedder_name.replace('/', '_')}.npy")
    try:
        embeddings = np.load(file_path, allow_pickle=True)
        print(f"Loaded embeddings from: {file_path}")
    except FileNotFoundError:
        # Cache miss: embed the whole corpus and persist it for next run.
        print(f"Embeddings not found. Recomputing for: {embedder_name}")
        embeddings = make_embeddings(embedder, embedder_name, db)
        save_embeddings(embedder_name, embeddings)
    return embeddings
def load_embedder_with_fallbacks(embedder_name):
    """Instantiate a SentenceTransformer on CPU with left-side padding."""
    print(f"Loading embedder {embedder_name}")
    # trust_remote_code is required by some embedder repos; CPU keeps the
    # app deployable on hosts without a GPU.
    return SentenceTransformer(
        embedder_name,
        trust_remote_code=True,
        tokenizer_kwargs={"padding_side": "left"},
        device='cpu',
    )
# -------- Cosine Similarity Index (no FAISS) --------
def build_cosine_index(embeddings):
    """L2-normalize each row so dot products equal cosine similarity.

    Args:
        embeddings: 2-D array, one embedding per row.

    Returns:
        Row-normalized copy of *embeddings*; all-zero rows are left as
        zero vectors instead of producing NaN via division by zero.
    """
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Guard 0/0 -> NaN for degenerate zero rows.
    safe_norms = np.where(norms == 0, 1.0, norms)
    return embeddings / safe_norms
def load_cosine_index(embedder_name):
    """Return the row-normalized embedding matrix for brute-force search."""
    return build_cosine_index(load_embeddings(embedder_name))
# -------- Cosine Similarity Search (Brute Force) --------
def vector_search(query, embedder, db, index, referenced_table_db, k=6):
    """Brute-force cosine search over the chunk database.

    Args:
        query: User query string.
        embedder: Model exposing an ``encode`` method (SentenceTransformer-like).
        db: List of chunk dicts with 'text' and 'metadata'.
        index: Row-normalized embedding matrix aligned with *db*.
        referenced_table_db: Chunks holding tables referenced by text chunks.
        k: Number of top text chunks to return.

    Returns:
        Top-k chunk dicts (with 'similarity') plus any referenced table
        chunks that were not already among the retrieved chunks.
    """
    def get_detailed_instruct(task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery:{query}'

    task = 'Given a search query, retrieve relevant passages that answer the query'
    query_embedding = embedder.encode([get_detailed_instruct(task, query)], convert_to_numpy=True)
    query_vec = query_embedding / np.linalg.norm(query_embedding)
    cosine_similarities = np.dot(index, query_vec.T).flatten()
    top_k_indices = np.argsort(-cosine_similarities)[:k]
    results = []
    referenced_tables = set()
    existed_tables = set()
    for i in top_k_indices:
        meta = db[i]['metadata']
        results.append({
            "text": db[i]['text'],
            "section": meta['section'],
            "chunk_id": meta['chunk_id'],
            "similarity": float(cosine_similarities[i]),
        })
        # Track which tables this chunk IS and which tables it REFERENCES,
        # so referenced tables that weren't retrieved directly can be added.
        # Both keys are optional metadata: .get() avoids the KeyError the
        # old code raised when 'referee_id' was absent.
        if meta.get('referee_id'):
            existed_tables.add(meta['referee_id'])
        referenced_tables.update(meta.get('referenced_tables') or [])
    # Tables referenced by results but not already present among them.
    tables_to_add = referenced_tables - existed_tables
    for chunk in referenced_table_db:
        if chunk['metadata'].get('referee_id') in tables_to_add:
            results.append({
                "text": chunk['text'],
                "section": chunk['metadata']['section'],
                "chunk_id": chunk['metadata']['chunk_id'],
            })
    return results
def load_together_llm_client():
    """Build a Together client using TOGETHER_API_KEY from the environment."""
    load_dotenv()
    api_key = os.getenv("TOGETHER_API_KEY")
    return Together(api_key=api_key)
def load_nvidia_llm_client():
    """Build an OpenAI-compatible client pointed at NVIDIA's API endpoint."""
    load_dotenv()
    client = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key=os.getenv("NVIDIA_API_KEY"),
    )
    return client
# -------- Prompt Construction --------
def construct_prompt(query, faiss_results):
    """Assemble the LLM prompt: system prompt, user query, retrieved context."""
    # The system prompt path is relative to the working directory.
    with open("src/system_prompt.txt", "r") as f:
        system_prompt = f.read().strip()
    parts = [
        f"\n### System Prompt\n{system_prompt}\n### User Query\n{query}\n### Clinical Guidelines Context\n"
    ]
    for res in faiss_results:
        parts.append(f"- reference: {res['section']}\n- This paragraph is from section: {res['text']}\n")
    return "".join(parts)
def construct_prompt_with_memory(query, faiss_results, chat_history=None, history_limit=4):
    """Like construct_prompt, but prepends up to *history_limit* chat turns."""
    with open("src/system_prompt.txt", "r") as f:
        system_prompt = f.read().strip()
    sections = [f"### System Prompt\n{system_prompt}\n\n"]
    if chat_history:
        # Only the most recent turns are kept, to bound prompt length.
        recent = chat_history[-history_limit:]
        turns = "".join(f"{m['role'].title()}: {m['content']}\n" for m in recent)
        sections.append(f"### Chat History\n{turns}\n")
    sections.append(f"### User Query\n{query}\n\n")
    sections.append("### Clinical Guidelines Context\n")
    for res in faiss_results:
        sections.append(f"- reference: {res['section']}\n- This paragraph is from section: {res['text']}\n")
    return "".join(sections)
def call_llm(llm_client, prompt, stream_flag=False, max_tokens=500, temperature=0.05, top_p=0.9, model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"):
    """Send *prompt* to a Together-style chat client.

    Returns a generator of text deltas when stream_flag is True, otherwise
    the full completion string. Any client error is printed with a
    traceback and re-raised.
    """
    print(f"Calling LLM with model: {model_name}")
    request_kwargs = dict(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    try:
        if stream_flag:
            def stream_generator():
                response = llm_client.chat.completions.create(stream=True, **request_kwargs)
                for chunk in response:
                    # Skip keep-alive chunks with no choices or empty deltas.
                    piece = chunk.choices[0].delta.content if chunk.choices else None
                    if piece:
                        yield piece
            return stream_generator()
        response = llm_client.chat.completions.create(stream=False, **request_kwargs)
        return response.choices[0].message.content
    except Exception as e:
        print("Error in call_llm:", str(e))
        import traceback
        traceback.print_exc()
        raise
def call_nvidia_llm(llm_client, prompt, stream_flag=False, max_tokens=4096, temperature=0.6, top_p=0.7, model_name="openai/gpt-oss-20b"):
    """Send *prompt* to an OpenAI-compatible (NVIDIA) chat client.

    Streaming mode yields content deltas; non-streaming returns the full
    message text. Errors are printed with a traceback and re-raised.
    """
    print(f"Calling NVIDIA LLM with model: {model_name}")
    params = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
    }
    try:
        if stream_flag:
            def stream_generator():
                for chunk in llm_client.chat.completions.create(stream=True, **params):
                    piece = chunk.choices[0].delta.content
                    if piece is not None:
                        yield piece
            return stream_generator()
        completion = llm_client.chat.completions.create(stream=False, **params)
        return completion.choices[0].message.content
    except Exception as e:
        print("Error in call_nvidia_llm:", str(e))
        import traceback
        traceback.print_exc()
        raise
def call_ollama(prompt, model="mistral", stream_flag=False, max_tokens=500, temperature=0.05, top_p=0.9):
    """Generate a completion from a local Ollama server, yielding text pieces.

    Fixes vs. the previous version:
    - Sampling parameters are sent inside "options": Ollama's /api/generate
      ignores top-level temperature/top_p, and its token limit is named
      "num_predict" (there is no "max_tokens" parameter).
    - "stream" now honors *stream_flag* instead of being hard-coded True.
      The line-iterating reader handles both modes, since a non-streaming
      response arrives as a single JSON line.

    Yields:
        Text fragments from the model ("response" field of each JSON line).
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": stream_flag,
        "options": {
            "temperature": temperature,
            "top_p": top_p,
            "num_predict": max_tokens,
        },
    }
    with requests.post(url, json=payload, stream=True) as response:
        for line in response.iter_lines():
            if not line:
                continue
            try:
                data = json.loads(line.decode("utf-8"))
            except ValueError:
                # Skip malformed/partial lines, best-effort as before.
                continue
            if "response" in data:
                yield data["response"]
# -------- Main Assistant Entry Points --------
def launch_depression_assistant(embedder_name, designated_client=None):
    """Initialize module-level RAG state: db, tables, embedder, index, client.

    Client selection: a designated client wins; otherwise NVIDIA is tried
    first with Together as fallback. llm_client stays None if both fail.
    """
    global db, referenced_tables_db, embedder, index, llm_client
    db = load_json_to_db("data/processed/guideline_db.json")
    referenced_tables_db = load_json_to_db("data/processed/referenced_table_chunks.json")
    embedder = load_embedder_with_fallbacks(embedder_name)
    index = load_cosine_index(embedder_name)
    if designated_client is not None:
        llm_client = designated_client
        print(f"Using designated client: {type(llm_client).__name__}")
    else:
        print("Attempting to load NVIDIA LLM client...")
        try:
            llm_client = load_nvidia_llm_client()
            print("Successfully loaded NVIDIA LLM client.")
        except Exception as e:
            print(f"Failed to load NVIDIA LLM client: {e}")
            print("Attempting to load Together LLM client as a fallback...")
            try:
                llm_client = load_together_llm_client()
                print("Successfully loaded Together LLM client.")
            except Exception as e:
                print(f"Failed to load Together LLM client: {e}")
                llm_client = None
    if llm_client is None:
        print("Warning: No LLM client could be loaded. The assistant will not be able to generate responses.")
    print("---------Depression Assistant is ready to use!--------------\n\n")
def depression_assistant(query, model_name=None, max_tokens=None, temperature=None, top_p=None, stream_flag=False, chat_history=None):
    """Answer *query* using retrieval plus the configured LLM client.

    Returns:
        (retrieved_results, response) — *response* is a string, or a
        generator of text pieces when stream_flag is True.

    Raises:
        ValueError: if no LLM client was initialized.

    Relies on module globals set by launch_depression_assistant.
    """
    results = vector_search(query, embedder, db, index, referenced_tables_db, k=3)
    prompt = construct_prompt_with_memory(query, results, chat_history=chat_history)
    # Forward only explicitly-provided overrides. `is not None` (rather than
    # truthiness, previously used for max_tokens/top_p/model_name) so falsy
    # values like top_p=0 or max_tokens=0 are not silently dropped.
    kwargs = {}
    if model_name is not None:
        kwargs['model_name'] = model_name
    if max_tokens is not None:
        kwargs['max_tokens'] = max_tokens
    if temperature is not None:
        kwargs['temperature'] = temperature
    if top_p is not None:
        kwargs['top_p'] = top_p
    if llm_client == "Run Ollama Locally":
        # call_ollama names its model parameter 'model', not 'model_name'.
        if 'model_name' in kwargs:
            kwargs['model'] = kwargs.pop('model_name')
        return results, call_ollama(prompt, stream_flag=stream_flag, **kwargs)
    elif isinstance(llm_client, OpenAI):  # NVIDIA Client
        return results, call_nvidia_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
    elif isinstance(llm_client, Together):  # Together Client
        return results, call_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
    else:
        if llm_client is None:
            raise ValueError("LLM client not initialized. Please check API keys.")
        # Unknown client type: assume an OpenAI-compatible (NVIDIA) interface.
        return results, call_nvidia_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
def load_queries_and_answers(query_file, answers_file):
    """Read the evaluation queries and gold answers, one per line each.

    Returns:
        (queries, answers) — two lists of lines (newlines preserved).
    """
    with open(query_file, 'r') as qf, open(answers_file, 'r') as af:
        return qf.readlines(), af.readlines()
def write_batched_results(embedder_name, result_path):
    """Run the assistant over the eval queries and write two markdown reports.

    One report holds the retrieved chunks per query, the other the LLM
    responses. Currently stops after the first query (see the break below).
    """
    launch_depression_assistant(embedder_name)
    queries, answers = load_queries_and_answers("data/raw/queries.txt", "data/raw/answers.txt")
    embedder_filename = embedder_name.replace('/', '_')
    retrieval_file = f"{result_path}Retrieved_Results_by_{embedder_filename}.md"
    response_file = f"{result_path}Response_by_{embedder_filename}.md"
    with open(retrieval_file, "w") as f1, open(response_file, "w") as f2:
        for i, query in enumerate(queries):
            result, response = depression_assistant(query)
            # Shared per-query header for both reports.
            header = f"## Query {i+1}\n{query.strip()}\n\n## Answer\n{answers[i].strip()}\n\n"
            f1.write(header + "## Retrieved Results\n")
            for res in result:
                f1.write(f"\n\n#### {res['section']}\n\n{res['text']}\n")
            f1.write("\n\n---\n\n")
            f2.write(header + f"## Response\n{response}\n\n---\n\n")
            break  # remove this `break` if you want to process all queries