# faq / vdb.py
# Source: kheopss — "Update vdb.py" (commit c107328, verified)
import hashlib
import json
import re
from pathlib import Path
from dotenv import load_dotenv
from llama_index.core import (QueryBundle)
from llama_index.core.postprocessor import LLMRerank
from nest_asyncio import apply
from openai import OpenAI
from tqdm import tqdm
from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Document
# Configure llama_index to embed locally with a HuggingFace MiniLM model
# rather than the default remote embedding backend.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
Settings.embed_model = embed_model
# Load variables from .env (e.g. OPENAI_API_KEY for the client created below)
load_dotenv()
def build_documents(sections):
    """Wrap parsed markdown sections in llama_index ``Document`` objects.

    Each section dict must provide ``"title"`` and ``"content"``; the title
    is attached to the Document as ``section_title`` metadata.
    """
    return [
        Document(
            text=section["content"],
            metadata={"section_title": section["title"]},
        )
        for section in sections
    ]
def create_vector_index(docs):
    """Build an in-memory ``VectorStoreIndex`` from the given Documents.

    Embeddings come from the module-level ``Settings.embed_model``
    (the HuggingFace MiniLM model configured at import time).
    """
    # Dead commented-out OpenAIEmbedding path removed; the embedding model
    # is selected once, globally, via Settings.
    return VectorStoreIndex.from_documents(docs)
def split_markdown_by_section(md_path: str):
    """Split a markdown file on top-level ``# `` headings.

    Returns a list of ``{"title": ..., "content": ...}`` dicts, one per
    non-empty section. Any text before the first heading is treated like a
    section whose first line plays the role of its title.
    """
    raw = Path(md_path).read_text(encoding="utf-8")
    chunks = []
    for part in re.split(r"(?m)^# ", raw):
        if not part.strip():
            continue
        head, newline, rest = part.partition("\n")
        chunks.append({
            "title": head.strip(),
            # A heading with nothing after it yields an empty body.
            "content": rest.strip() if newline else "",
        })
    return chunks
# Module-level OpenAI client; picks up OPENAI_API_KEY from the environment
# (loaded from .env earlier in this module).
client = OpenAI()
# nest_asyncio.apply: patch asyncio so event loops can be nested
# (needed e.g. when running inside notebooks or an already-running loop).
apply()
# Register tqdm's pandas integration (enables DataFrame.progress_apply).
tqdm.pandas()
def hash_data(data):
    """Return a stable SHA-256 hex digest of a JSON-serializable value.

    Dict keys are sorted before serialization, so logically-equal dicts
    hash identically regardless of insertion order.
    """
    canonical = json.dumps(data, sort_keys=True).encode('utf-8')
    return hashlib.sha256(canonical).hexdigest()
def get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=True):
    """Retrieve the most similar nodes for ``query`` from ``index``.

    Fetches ``vector_top_k`` candidates by vector similarity; when
    ``with_reranker`` is true, an LLM reranker narrows them down to
    ``reranker_top_n``.
    """
    bundle = QueryBundle(query)
    candidates = index.as_retriever(similarity_top_k=vector_top_k).retrieve(bundle)
    if not with_reranker:
        return candidates
    reranker = LLMRerank(choice_batch_size=5, top_n=reranker_top_n)
    return reranker.postprocess_nodes(candidates, bundle)
def get_all_text(nodes):
    """Join node texts as space-separated ``\\n- <text>`` bullet items."""
    bullets = [f"\n- {node.get_text()}" for node in nodes]
    return ' '.join(bullets)
async def further_retrieve(query, index, messages):
    """Best-effort RAG step for ``query``.

    Retrieves context nodes from ``index`` (no reranking) and hands their
    concatenated text, together with the chat ``messages``, to the
    streaming ``completion`` generator.

    Returns the async generator, or ``None`` if anything failed.
    """
    try:
        nodes = get_retrieved_nodes(
            query, index, vector_top_k=10, reranker_top_n=3, with_reranker=False
        )
        context = get_all_text(nodes)
        return completion(query, context, messages)
    except Exception as err:
        # Deliberate best-effort: report the error and signal failure.
        print(err)
        return None
async def completion(query, docs, messages):
    """Stream a customer-support answer for ``query`` grounded in ``docs``.

    Appends a system prompt (embedding ``docs``) plus the query to
    ``messages`` — note the caller's list is mutated in place — then
    streams the model response.

    Yields:
        str: incremental content deltas from the chat completion stream.
    """
    messages.extend([
        {
            "role": "system",
            # ``\\[`` keeps the literal backslash-bracket the prompt intends;
            # a bare ``\[`` in a non-raw f-string is an invalid escape
            # sequence (SyntaxWarning on modern Python).
            "content": f"""
Given tone and voice guidelines and customer support help documents, act as a customer support bot.
Answer any further questions as if you are customer support bot.
TONE AND VOICE:
promote the society, be gentle, be kind always positive.
DOCUMENT:
{docs}
INSTRUCTIONS:
- Answer the users QUESTION using the DOCUMENT text above.
- Format formula into latex format between $...$ or \\[...\\]
- Keep your answer ground in the facts of the DOCUMENT or chat history.
- If document has an image markdown ,use it in your answer
- Respond in same language as user Question
- Use Markdown Structure
- DOCUMENT can have images with there descriptions
- if a text is followed by an image dont skip the image
QUESTION:
"""
        },
        {
            # NOTE(review): the user's query is sent with role "system";
            # role "user" is the conventional choice — confirm intent
            # before changing, as it may affect model behavior.
            "role": "system",
            "content": query
        }
    ])
    # Named ``stream`` to avoid shadowing this function's own name.
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        stream=True,
    )
    for chunk in stream:
        # Skip empty/keep-alive deltas; yield only actual content.
        if chunk.choices[0].delta.content:
            yield chunk.choices[0].delta.content