|
|
import hashlib |
|
|
import json |
|
|
import re |
|
|
from pathlib import Path |
|
|
|
|
|
from dotenv import load_dotenv |
|
|
from llama_index.core import (QueryBundle) |
|
|
from llama_index.core.postprocessor import LLMRerank |
|
|
from nest_asyncio import apply |
|
|
from openai import OpenAI |
|
|
from tqdm import tqdm |
|
|
|
|
|
from llama_index.core import VectorStoreIndex, Settings |
|
|
from llama_index.embeddings.openai import OpenAIEmbedding |
|
|
|
|
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding |
|
|
|
|
|
from llama_index.core import Document |
|
|
|
|
|
|
|
|
# Module-level setup: use a local sentence-transformers model for embeddings
# instead of the OpenAI embedding API (the first call downloads the model
# weights if they are not cached).
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Register it as the global default so every llama-index component
# (index build, retriever) embeds with the same model.
Settings.embed_model = embed_model


# Load environment variables from a .env file if present
# (presumably OPENAI_API_KEY for the OpenAI client created below — confirm).
load_dotenv()
|
|
|
|
|
def build_documents(sections):
    """Wrap parsed markdown sections in llama-index ``Document`` objects.

    Each item of *sections* is a dict with "title" and "content"; the title
    is attached to the Document as ``section_title`` metadata.
    """
    return [
        Document(
            text=section["content"],
            metadata={"section_title": section["title"]},
        )
        for section in sections
    ]
|
|
|
|
|
def create_vector_index(docs):
    """Build an in-memory vector index over *docs*.

    Embeddings come from the globally configured ``Settings.embed_model``.
    """
    return VectorStoreIndex.from_documents(docs)
|
|
|
|
|
def split_markdown_by_section(md_path: str):
    """Split a markdown file into per-H1 sections.

    Sections are delimited by lines beginning with ``# ``. Returns a list of
    dicts with "title" (the heading text) and "content" (everything up to the
    next heading, stripped). Empty sections are skipped.
    """
    raw = Path(md_path).read_text(encoding="utf-8")
    parts = re.split(r"(?m)^# ", raw)

    sections = []
    for part in parts:
        if not part.strip():
            continue
        # First line is the heading; the remainder (possibly empty) is the body.
        heading, _, body = part.partition("\n")
        sections.append({"title": heading.strip(), "content": body.strip()})
    return sections
|
|
|
|
|
|
|
|
|
|
|
# OpenAI API client; picks up OPENAI_API_KEY from the environment
# (populated earlier by load_dotenv()).
client = OpenAI()

# nest_asyncio.apply(): patch asyncio to allow re-entrant event loops, needed
# when this module runs inside an environment that already owns a running
# loop (e.g. Jupyter).
apply()

# Register tqdm's `progress_apply` / `progress_map` hooks on pandas objects.
# NOTE(review): pandas is not imported in this chunk — presumably used
# elsewhere in the project; confirm this call is still needed.
tqdm.pandas()
|
|
|
|
|
|
|
|
def hash_data(data):
    """Return the SHA-256 hex digest of *data*'s canonical JSON form.

    Keys are sorted before serialization so logically equal structures
    always produce the same digest.
    """
    canonical = json.dumps(data, sort_keys=True).encode('utf-8')
    return hashlib.sha256(canonical).hexdigest()
|
|
|
|
|
|
|
|
def get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=True):
    """Retrieve the most similar nodes for *query* from *index*.

    Fetches *vector_top_k* candidates by embedding similarity; when
    *with_reranker* is true, an LLM reranker narrows them down to
    *reranker_top_n*. Returns the resulting node list.
    """
    bundle = QueryBundle(query)
    nodes = index.as_retriever(similarity_top_k=vector_top_k).retrieve(bundle)

    if not with_reranker:
        return nodes

    reranker = LLMRerank(choice_batch_size=5, top_n=reranker_top_n)
    return reranker.postprocess_nodes(nodes, bundle)
|
|
|
|
|
|
|
|
def get_all_text(nodes):
    """Join each node's text into one string of "\\n- " bullet items.

    Items are separated by a single space (the bullet itself starts with a
    newline). Returns "" for an empty node list.
    """
    bullets = [f"\n- {n.get_text()}" for n in nodes]
    return ' '.join(bullets)
|
|
|
|
|
|
|
|
async def further_retrieve(query, index, messages):
    """Retrieve context for *query* from *index* and start a grounded completion.

    Returns the async generator created by ``completion`` (which streams the
    assistant's reply), or None if retrieval fails. Reranking is deliberately
    disabled here (with_reranker=False).
    """
    try:
        retrieved_nodes = get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=False)
        # `completion` is an async generator function, so this call only
        # CREATES the generator — errors raised while streaming happen later,
        # in the caller's iteration, and are NOT caught by this except block.
        return completion(query, get_all_text(retrieved_nodes), messages)
    except Exception as e:
        # NOTE(review): broad best-effort catch — prints and signals failure
        # with None; callers must handle a None result. Consider logging.
        print(e)
        return None
|
|
|
|
|
|
|
|
async def completion(query, docs, messages):
    """Stream a customer-support answer for *query*, grounded in *docs*.

    Side effect: appends two entries to the caller's *messages* list (the
    grounding system prompt containing *docs*, then the query) before the
    API call, so the conversation history accumulates across calls.

    Yields: non-empty text chunks of the assistant's reply as they arrive.
    """
    messages.extend([
        {
            "role": "system",
            "content": f"""
Given tone and voice guidelines and customer support help documents, act as a customer support bot.
Answer any further questions as if you are customer support bot.
TONE AND VOICE:
promote the society, be gentle, be kind always positive.

DOCUMENT:
{docs}

INSTRUCTIONS:

- Answer the users QUESTION using the DOCUMENT text above.
- Format formula into latex format between $...$ or \[...\]
- Keep your answer ground in the facts of the DOCUMENT or chat history.
- If document has an image markdown ,use it in your answer
- Respond in same language as user Question
- Use Markdown Structure
- DOCUMENT can have images with there descriptions
- if a text is followed by an image dont skip the image
QUESTION:
""",
        },
        {
            # NOTE(review): the user's question is sent with role "system";
            # "user" is the conventional role — confirm intent before changing.
            "role": "system",
            "content": query,
        },
    ])
    # Renamed local from `completion` to `stream`: the original shadowed this
    # function's own name, which is confusing and blocks any recursive use.
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        stream=True,
    )
    for chunk in stream:
        # Skip empty/keep-alive deltas; yield only actual content tokens.
        if chunk.choices[0].delta.content:
            yield chunk.choices[0].delta.content
|
|
|