|
|
import hashlib |
|
|
import json |
|
|
import re |
|
|
from pathlib import Path |
|
|
|
|
|
from dotenv import load_dotenv |
|
|
from llama_index.core import (QueryBundle) |
|
|
from llama_index.core.postprocessor import LLMRerank |
|
|
from nest_asyncio import apply |
|
|
from openai import OpenAI |
|
|
from tqdm import tqdm |
|
|
|
|
|
from llama_index.core import VectorStoreIndex, Settings |
|
|
from llama_index.embeddings.openai import OpenAIEmbedding |
|
|
|
|
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding |
|
|
|
|
|
from llama_index.core import Document |
|
|
|
|
|
|
|
|
# Module-level setup: use a local sentence-transformers model for embeddings
# instead of the OpenAI embedding API (the first call downloads the model
# weights if they are not cached).
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Register it as the global default so every llama-index component
# (index build, retriever) embeds with the same model.
Settings.embed_model = embed_model


# Load environment variables from a .env file if present
# (presumably OPENAI_API_KEY for the OpenAI client created below — confirm).
load_dotenv()
|
|
|
|
|
def build_documents(sections):
    """Wrap parsed markdown sections in llama-index ``Document`` objects.

    Each item of *sections* is a dict with "title" and "content"; the title
    is attached to the Document as ``section_title`` metadata.
    """
    return [
        Document(
            text=section["content"],
            metadata={"section_title": section["title"]},
        )
        for section in sections
    ]
|
|
|
|
|
def create_vector_index(docs):
    """Build an in-memory vector index over *docs*.

    Embeddings come from the globally configured ``Settings.embed_model``.
    """
    return VectorStoreIndex.from_documents(docs)
|
|
|
|
|
def split_markdown_by_section(md_path: str):
    """Split a markdown file into per-H1 sections.

    Sections are delimited by lines beginning with ``# ``. Returns a list of
    dicts with "title" (the heading text) and "content" (everything up to the
    next heading, stripped). Empty sections are skipped.
    """
    raw = Path(md_path).read_text(encoding="utf-8")
    parts = re.split(r"(?m)^# ", raw)

    sections = []
    for part in parts:
        if not part.strip():
            continue
        # First line is the heading; the remainder (possibly empty) is the body.
        heading, _, body = part.partition("\n")
        sections.append({"title": heading.strip(), "content": body.strip()})
    return sections
|
|
|
|
|
|
|
|
|
|
|
# OpenAI API client; picks up OPENAI_API_KEY from the environment
# (populated earlier by load_dotenv()).
client = OpenAI()

# nest_asyncio.apply(): patch asyncio to allow re-entrant event loops, needed
# when this module runs inside an environment that already owns a running
# loop (e.g. Jupyter).
apply()

# Register tqdm's `progress_apply` / `progress_map` hooks on pandas objects.
# NOTE(review): pandas is not imported in this chunk — presumably used
# elsewhere in the project; confirm this call is still needed.
tqdm.pandas()
|
|
|
|
|
|
|
|
def hash_data(data):
    """Return the SHA-256 hex digest of *data*'s canonical JSON form.

    Keys are sorted before serialization so logically equal structures
    always produce the same digest.
    """
    canonical = json.dumps(data, sort_keys=True).encode('utf-8')
    return hashlib.sha256(canonical).hexdigest()
|
|
|
|
|
|
|
|
def get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=True):
    """Retrieve the most similar nodes for *query* from *index*.

    Fetches *vector_top_k* candidates by embedding similarity; when
    *with_reranker* is true, an LLM reranker narrows them down to
    *reranker_top_n*. Returns the resulting node list.
    """
    bundle = QueryBundle(query)
    nodes = index.as_retriever(similarity_top_k=vector_top_k).retrieve(bundle)

    if not with_reranker:
        return nodes

    reranker = LLMRerank(choice_batch_size=5, top_n=reranker_top_n)
    return reranker.postprocess_nodes(nodes, bundle)
|
|
|
|
|
|
|
|
def get_all_text(nodes):
    """Join each node's text into one string of "\\n- " bullet items.

    Items are separated by a single space (the bullet itself starts with a
    newline). Returns "" for an empty node list.
    """
    bullets = [f"\n- {n.get_text()}" for n in nodes]
    return ' '.join(bullets)
|
|
|
|
|
|
|
|
async def further_retrieve(query, index, messages):
    """Retrieve context for *query* from *index* and start a grounded completion.

    Returns the async generator created by ``completion`` (which streams the
    assistant's reply), or None if retrieval fails. Reranking is deliberately
    disabled here (with_reranker=False).
    """
    try:
        retrieved_nodes = get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=False)
        # `completion` is an async generator function, so this call only
        # CREATES the generator — errors raised while streaming happen later,
        # in the caller's iteration, and are NOT caught by this except block.
        return completion(query, get_all_text(retrieved_nodes), messages)
    except Exception as e:
        # NOTE(review): broad best-effort catch — prints and signals failure
        # with None; callers must handle a None result. Consider logging.
        print(e)
        return None
|
|
|
|
|
|
|
|
async def completion(query, docs, messages):
    """Stream a customer-support answer for *query*, grounded in *docs*.

    Side effect: appends two entries to the caller's *messages* list (the
    grounding system prompt containing *docs*, then the query) before the
    API call, so the conversation history accumulates across calls.

    Yields: non-empty text chunks of the assistant's reply as they arrive.
    """
    messages.extend([
        {
            "role": "system",
            "content": f"""
Given tone and voice guidelines and customer support help documents, act as a customer support bot.
Answer any further questions as if you are customer support bot.
TONE AND VOICE:
promote the society, be gentle, be kind always positive.

DOCUMENT:
{docs}

INSTRUCTIONS:

- Answer the users QUESTION using the DOCUMENT text above.
- Format formula into latex format between $...$ or \[...\]
- Keep your answer ground in the facts of the DOCUMENT or chat history.
- If document has an image markdown ,use it in your answer
- Respond in same language as user Question
- Use Markdown Structure
- DOCUMENT can have images with there descriptions
- if a text is followed by an image dont skip the image
QUESTION:
""",
        },
        {
            # NOTE(review): the user's question is sent with role "system";
            # "user" is the conventional role — confirm intent before changing.
            "role": "system",
            "content": query,
        },
    ])
    # Renamed local from `completion` to `stream`: the original shadowed this
    # function's own name, which is confusing and blocks any recursive use.
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        stream=True,
    )
    for chunk in stream:
        # Skip empty/keep-alive deltas; yield only actual content tokens.
        if chunk.choices[0].delta.content:
            yield chunk.choices[0].delta.content
|
|
|