Spaces:

kheopss
/

faq

Sleeping

App Files Community

kheopss committed on Nov 12, 2025

Commit

c107328

verified ·

1 Parent(s): f4dd31d

Update vdb.py

Browse files

Files changed (1) hide show

vdb.py +138 -129

vdb.py CHANGED Viewed

@@ -1,129 +1,138 @@
-import hashlib
-import json
-import re
-from pathlib import Path
-from dotenv import load_dotenv
-from llama_index.core import (QueryBundle)
-from llama_index.core.postprocessor import LLMRerank
-from nest_asyncio import apply
-from openai import OpenAI
-from tqdm import tqdm
-from llama_index.core import VectorStoreIndex
-from llama_index.embeddings.openai import OpenAIEmbedding
-from llama_index.core import Document
-# Load variables from .env
-load_dotenv()
-def build_documents(sections):
-    docs = []
-    for s in sections:
-        metadata = {"section_title": s["title"]}
-        docs.append(Document(text=s["content"], metadata=metadata))
-    return docs
-def create_vector_index(docs):
-    embed_model = OpenAIEmbedding()
-    index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
-    return index
-def split_markdown_by_section(md_path: str):
-    text = Path(md_path).read_text(encoding="utf-8")
-    sections = re.split(r"(?m)^# ", text)
-    chunks = []
-    for section in sections:
-        if not section.strip():
-            continue
-        title, *content = section.split("\n", 1)
-        body = content[0].strip() if content else ""
-        chunks.append({"title": title.strip(), "content": body})
-    return chunks
-client = OpenAI()
-apply()
-tqdm.pandas()
-def hash_data(data):
-    json_str = json.dumps(data, sort_keys=True)
-    json_bytes = json_str.encode('utf-8')
-    hash_hex = hashlib.sha256(json_bytes).hexdigest()
-    return hash_hex
-def get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=True):
-    query_bundle = QueryBundle(query)
-    retriever = index.as_retriever(similarity_top_k=vector_top_k)
-    retrieved_nodes = retriever.retrieve(query_bundle)
-    if with_reranker:
-        reranker = LLMRerank(choice_batch_size=5, top_n=reranker_top_n)
-        retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
-    return retrieved_nodes
-def get_all_text(nodes):
-    return ' '.join(f"\n- {node.get_text()}" for node in nodes)
-async def further_retrieve(query, index, messages):
-    try:
-        retrieved_nodes = get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=False)
-        return completion(query, get_all_text(retrieved_nodes), messages)
-    except Exception as e:
-        print(e)
-        return None
-async def completion(query, docs, messages):
-    messages.extend([
-        {
-            "role": "system",
-            "content": f"""
-Given tone and voice guidelines and customer support help documents, act as a customer support bot.
-Answer any further questions as if you are customer support bot.
-TONE AND VOICE:
-promote the society, be gentle, be kind always positive.
-DOCUMENT:
-{docs}
-INSTRUCTIONS:
-- Answer the users QUESTION using the DOCUMENT text above.
-- Format formula into latex format between $...$ or \[...\]
-- Keep your answer ground in the facts of the DOCUMENT or chat history.
-- If document has an image markdown ,use it in your answer
-- Respond in same language as user Question
-- Use Markdown Structure
-- DOCUMENT can have images with there descriptions
-- if a text is followed by an image dont skip  the image
-QUESTION:
-              """
-        },
-        {
-            "role": "system",
-            "content": query
-        }
-    ])
-    completion = client.chat.completions.create(
-        model="gpt-4o-mini",
-        messages=messages,
-        stream=True
-    )
-    for chunk in completion:
-        if chunk.choices[0].delta.content:
-            yield chunk.choices[0].delta.content

+import hashlib
+import json
+import re
+from pathlib import Path
+from dotenv import load_dotenv
+from llama_index.core import (QueryBundle)
+from llama_index.core.postprocessor import LLMRerank
+from nest_asyncio import apply
+from openai import OpenAI
+from tqdm import tqdm
+from llama_index.core import VectorStoreIndex, Settings
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core import Document
+embed_model = HuggingFaceEmbedding(
+        model_name="sentence-transformers/all-MiniLM-L6-v2"
+    )
+Settings.embed_model = embed_model
+# Load variables from .env
+load_dotenv()
+def build_documents(sections):
+    docs = []
+    for s in sections:
+        metadata = {"section_title": s["title"]}
+        docs.append(Document(text=s["content"], metadata=metadata))
+    return docs
+def create_vector_index(docs):
+    # embed_model = OpenAIEmbedding()
+    # index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
+    index = VectorStoreIndex.from_documents(docs)
+    return index
+def split_markdown_by_section(md_path: str):
+    text = Path(md_path).read_text(encoding="utf-8")
+    sections = re.split(r"(?m)^# ", text)
+    chunks = []
+    for section in sections:
+        if not section.strip():
+            continue
+        title, *content = section.split("\n", 1)
+        body = content[0].strip() if content else ""
+        chunks.append({"title": title.strip(), "content": body})
+    return chunks
+client = OpenAI()
+apply()
+tqdm.pandas()
+def hash_data(data):
+    json_str = json.dumps(data, sort_keys=True)
+    json_bytes = json_str.encode('utf-8')
+    hash_hex = hashlib.sha256(json_bytes).hexdigest()
+    return hash_hex
+def get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=True):
+    query_bundle = QueryBundle(query)
+    retriever = index.as_retriever(similarity_top_k=vector_top_k)
+    retrieved_nodes = retriever.retrieve(query_bundle)
+    if with_reranker:
+        reranker = LLMRerank(choice_batch_size=5, top_n=reranker_top_n)
+        retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
+    return retrieved_nodes
+def get_all_text(nodes):
+    return ' '.join(f"\n- {node.get_text()}" for node in nodes)
+async def further_retrieve(query, index, messages):
+    try:
+        retrieved_nodes = get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=False)
+        return completion(query, get_all_text(retrieved_nodes), messages)
+    except Exception as e:
+        print(e)
+        return None
+async def completion(query, docs, messages):
+    messages.extend([
+        {
+            "role": "system",
+            "content": f"""
+Given tone and voice guidelines and customer support help documents, act as a customer support bot.
+Answer any further questions as if you are customer support bot.
+TONE AND VOICE:
+promote the society, be gentle, be kind always positive.
+DOCUMENT:
+{docs}
+INSTRUCTIONS:
+- Answer the users QUESTION using the DOCUMENT text above.
+- Format formula into latex format between $...$ or \[...\]
+- Keep your answer ground in the facts of the DOCUMENT or chat history.
+- If document has an image markdown ,use it in your answer
+- Respond in same language as user Question
+- Use Markdown Structure
+- DOCUMENT can have images with there descriptions
+- if a text is followed by an image dont skip  the image
+QUESTION:
+              """
+        },
+        {
+            "role": "system",
+            "content": query
+        }
+    ])
+    completion = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=messages,
+        stream=True
+    )
+    for chunk in completion:
+        if chunk.choices[0].delta.content:
+            yield chunk.choices[0].delta.content