rmt4genai committed on
Commit ·
ee25382
1
Parent(s): 9493d15
renamed files
Browse files
- evaluation/{eval_2.py → eval_gemini_unranked.py} +0 -0
- evaluation/{eval_1.py → eval_qwen.py} +0 -0
- pro_implementation/answer_gemini.py → implementation/answer_gemini_adv +0 -0
- implementation/{answer_1.py → answer_qwen.py} +0 -2
- pro_implementation/answer.py → implementation/answer_qwen_adv +0 -0
- implementation/{ingest_gemini.py → ingest_gemini_adv.py} +0 -0
- pro_implementation/__pycache__/answer.cpython-312.pyc +0 -0
- pro_implementation/__pycache__/answer_gemini.cpython-312.pyc +0 -0
- pro_implementation/ingest.py +0 -143
- pro_implementation/ingest_gemini.py +0 -144
evaluation/{eval_2.py → eval_gemini_unranked.py}
RENAMED
|
File without changes
|
evaluation/{eval_1.py → eval_qwen.py}
RENAMED
|
File without changes
|
pro_implementation/answer_gemini.py → implementation/answer_gemini_adv
RENAMED
|
File without changes
|
implementation/{answer_1.py → answer_qwen.py}
RENAMED
|
@@ -5,9 +5,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
|
|
| 5 |
from langchain_core.messages import SystemMessage, HumanMessage, convert_to_messages
|
| 6 |
from langchain_core.documents import Document
|
| 7 |
from ollama import Client
|
| 8 |
-
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
-
from openai import OpenAI
|
| 11 |
|
| 12 |
|
| 13 |
|
|
|
|
| 5 |
from langchain_core.messages import SystemMessage, HumanMessage, convert_to_messages
|
| 6 |
from langchain_core.documents import Document
|
| 7 |
from ollama import Client
|
|
|
|
| 8 |
from dotenv import load_dotenv
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
|
pro_implementation/answer.py → implementation/answer_qwen_adv
RENAMED
|
File without changes
|
implementation/{ingest_gemini.py → ingest_gemini_adv.py}
RENAMED
|
File without changes
|
pro_implementation/__pycache__/answer.cpython-312.pyc
DELETED
|
Binary file (8.36 kB)
|
|
|
pro_implementation/__pycache__/answer_gemini.cpython-312.pyc
DELETED
|
Binary file (8.96 kB)
|
|
|
pro_implementation/ingest.py
DELETED
|
@@ -1,143 +0,0 @@
|
|
| 1 |
-
from pathlib import Path
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
from pydantic import BaseModel, Field
|
| 4 |
-
from chromadb import PersistentClient
|
| 5 |
-
from tqdm import tqdm
|
| 6 |
-
from litellm import completion
|
| 7 |
-
from multiprocessing import Pool
|
| 8 |
-
from tenacity import retry, wait_exponential
|
| 9 |
-
from langchain_huggingface import HuggingFaceEmbeddings
|
| 10 |
-
|
| 11 |
-
load_dotenv(override=True)
|
| 12 |
-
|
| 13 |
-
MODEL = "ollama/qwen3:4b"
|
| 14 |
-
|
| 15 |
-
DB_NAME = str(Path(__file__).parent.parent / "preprocessed_db")
|
| 16 |
-
collection_name = "docs"
|
| 17 |
-
embeddings = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B")
|
| 18 |
-
KNOWLEDGE_BASE_PATH = Path(__file__).parent.parent / "knowledge-base"
|
| 19 |
-
AVERAGE_CHUNK_SIZE = 500
|
| 20 |
-
wait = wait_exponential(multiplier=1, min=10, max=240)
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
WORKERS = 2
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
class Result(BaseModel):
|
| 27 |
-
page_content: str
|
| 28 |
-
metadata: dict
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
class Chunk(BaseModel):
|
| 32 |
-
headline: str = Field(
|
| 33 |
-
description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query",
|
| 34 |
-
)
|
| 35 |
-
summary: str = Field(
|
| 36 |
-
description="A few sentences summarizing the content of this chunk to answer common questions"
|
| 37 |
-
)
|
| 38 |
-
original_text: str = Field(
|
| 39 |
-
description="The original text of this chunk from the provided document, exactly as is, not changed in any way"
|
| 40 |
-
)
|
| 41 |
-
|
| 42 |
-
def as_result(self, document):
|
| 43 |
-
metadata = {"source": document["source"], "type": document["type"]}
|
| 44 |
-
return Result(
|
| 45 |
-
page_content=self.headline + "\n\n" + self.summary + "\n\n" + self.original_text,
|
| 46 |
-
metadata=metadata,
|
| 47 |
-
)
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
class Chunks(BaseModel):
|
| 51 |
-
chunks: list[Chunk]
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
def fetch_documents():
|
| 55 |
-
"""A homemade version of the LangChain DirectoryLoader"""
|
| 56 |
-
|
| 57 |
-
documents = []
|
| 58 |
-
|
| 59 |
-
for folder in KNOWLEDGE_BASE_PATH.iterdir():
|
| 60 |
-
doc_type = folder.name
|
| 61 |
-
for file in folder.rglob("*.md"):
|
| 62 |
-
with open(file, "r", encoding="utf-8") as f:
|
| 63 |
-
documents.append({"type": doc_type, "source": file.as_posix(), "text": f.read()})
|
| 64 |
-
|
| 65 |
-
print(f"Loaded {len(documents)} documents")
|
| 66 |
-
return documents
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
def make_prompt(document):
|
| 70 |
-
how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1
|
| 71 |
-
return f"""
|
| 72 |
-
You take a document and you split the document into overlapping chunks for a KnowledgeBase.
|
| 73 |
-
|
| 74 |
-
The document is from my portfolio website's github repo.
|
| 75 |
-
The document is of type: {document["type"]}
|
| 76 |
-
The document has been retrieved from: {document["source"]}
|
| 77 |
-
|
| 78 |
-
A chatbot will use these chunks to answer questions about my skills, experience and projects.
|
| 79 |
-
You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
|
| 80 |
-
This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
|
| 81 |
-
There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.
|
| 82 |
-
|
| 83 |
-
For each chunk, you should provide a headline, a summary, and the original text of the chunk.
|
| 84 |
-
Together your chunks should represent the entire document with overlap.
|
| 85 |
-
|
| 86 |
-
Here is the document:
|
| 87 |
-
|
| 88 |
-
{document["text"]}
|
| 89 |
-
|
| 90 |
-
Respond with the chunks.
|
| 91 |
-
"""
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
def make_messages(document):
|
| 95 |
-
return [
|
| 96 |
-
{"role": "user", "content": make_prompt(document)},
|
| 97 |
-
]
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
@retry(wait=wait)
|
| 101 |
-
def process_document(document):
|
| 102 |
-
messages = make_messages(document)
|
| 103 |
-
response = completion(model=MODEL, messages=messages, response_format=Chunks,base_url="http://localhost:11434")
|
| 104 |
-
reply = response.choices[0].message.content
|
| 105 |
-
doc_as_chunks = Chunks.model_validate_json(reply).chunks
|
| 106 |
-
return [chunk.as_result(document) for chunk in doc_as_chunks]
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
def create_chunks(documents):
|
| 110 |
-
"""
|
| 111 |
-
Create chunks using a number of workers in parallel.
|
| 112 |
-
If you get a rate limit error, set the WORKERS to 1.
|
| 113 |
-
"""
|
| 114 |
-
chunks = []
|
| 115 |
-
with Pool(processes=WORKERS) as pool:
|
| 116 |
-
for result in tqdm(pool.imap_unordered(process_document, documents), total=len(documents)):
|
| 117 |
-
chunks.extend(result)
|
| 118 |
-
return chunks
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
def create_embeddings(chunks):
|
| 122 |
-
chroma = PersistentClient(path=DB_NAME)
|
| 123 |
-
if collection_name in [c.name for c in chroma.list_collections()]:
|
| 124 |
-
chroma.delete_collection(collection_name)
|
| 125 |
-
|
| 126 |
-
texts = [chunk.page_content for chunk in chunks]
|
| 127 |
-
vectors= embeddings.embed_documents(texts)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
collection = chroma.get_or_create_collection(collection_name)
|
| 131 |
-
|
| 132 |
-
ids = [str(i) for i in range(len(chunks))]
|
| 133 |
-
metas = [chunk.metadata for chunk in chunks]
|
| 134 |
-
|
| 135 |
-
collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
|
| 136 |
-
print(f"Vectorstore created with {collection.count()} documents")
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
if __name__ == "__main__":
|
| 140 |
-
documents = fetch_documents()
|
| 141 |
-
chunks = create_chunks(documents)
|
| 142 |
-
create_embeddings(chunks)
|
| 143 |
-
print("Ingestion complete")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pro_implementation/ingest_gemini.py
DELETED
|
@@ -1,144 +0,0 @@
|
|
| 1 |
-
from pathlib import Path
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
from pydantic import BaseModel, Field
|
| 4 |
-
from chromadb import PersistentClient
|
| 5 |
-
from tqdm import tqdm
|
| 6 |
-
from google import genai
|
| 7 |
-
from google.genai import types
|
| 8 |
-
from multiprocessing import Pool
|
| 9 |
-
from tenacity import retry, wait_exponential
|
| 10 |
-
from langchain_huggingface import HuggingFaceEmbeddings
|
| 11 |
-
|
| 12 |
-
load_dotenv(override=True)
|
| 13 |
-
|
| 14 |
-
MODEL = "gemini-2.5-flash"
|
| 15 |
-
|
| 16 |
-
DB_NAME = str(Path(__file__).parent.parent / "preprocessed_db")
|
| 17 |
-
collection_name = "docs"
|
| 18 |
-
embeddings = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B")
|
| 19 |
-
KNOWLEDGE_BASE_PATH = Path(__file__).parent.parent / "knowledge-base"
|
| 20 |
-
AVERAGE_CHUNK_SIZE = 500
|
| 21 |
-
wait = wait_exponential(multiplier=1, min=10, max=240)
|
| 22 |
-
|
| 23 |
-
client = genai.Client()
|
| 24 |
-
|
| 25 |
-
WORKERS = 2
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
class Result(BaseModel):
|
| 29 |
-
page_content: str
|
| 30 |
-
metadata: dict
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
class Chunk(BaseModel):
|
| 34 |
-
headline: str = Field(
|
| 35 |
-
description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query",
|
| 36 |
-
)
|
| 37 |
-
summary: str = Field(
|
| 38 |
-
description="A few sentences summarizing the content of this chunk to answer common questions"
|
| 39 |
-
)
|
| 40 |
-
original_text: str = Field(
|
| 41 |
-
description="The original text of this chunk from the provided document, exactly as is, not changed in any way"
|
| 42 |
-
)
|
| 43 |
-
|
| 44 |
-
def as_result(self, document):
|
| 45 |
-
metadata = {"source": document["source"], "type": document["type"]}
|
| 46 |
-
return Result(
|
| 47 |
-
page_content=self.headline + "\n\n" + self.summary + "\n\n" + self.original_text,
|
| 48 |
-
metadata=metadata,
|
| 49 |
-
)
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
class Chunks(BaseModel):
|
| 53 |
-
chunks: list[Chunk]
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def fetch_documents():
|
| 57 |
-
"""A homemade version of the LangChain DirectoryLoader"""
|
| 58 |
-
|
| 59 |
-
documents = []
|
| 60 |
-
|
| 61 |
-
for folder in KNOWLEDGE_BASE_PATH.iterdir():
|
| 62 |
-
doc_type = folder.name
|
| 63 |
-
for file in folder.rglob("*.md"):
|
| 64 |
-
with open(file, "r", encoding="utf-8") as f:
|
| 65 |
-
documents.append({"type": doc_type, "source": file.as_posix(), "text": f.read()})
|
| 66 |
-
|
| 67 |
-
print(f"Loaded {len(documents)} documents")
|
| 68 |
-
return documents
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
def make_prompt(document):
|
| 72 |
-
how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1
|
| 73 |
-
return f"""
|
| 74 |
-
You take a document and you split the document into overlapping chunks for a KnowledgeBase.
|
| 75 |
-
|
| 76 |
-
The document is from my portfolio website's github repo.
|
| 77 |
-
The document is of type: {document["type"]}
|
| 78 |
-
The document has been retrieved from: {document["source"]}
|
| 79 |
-
|
| 80 |
-
A chatbot will use these chunks to answer questions about my skills, experience and projects.
|
| 81 |
-
You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
|
| 82 |
-
This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
|
| 83 |
-
There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.
|
| 84 |
-
|
| 85 |
-
For each chunk, you should provide a headline, a summary, and the original text of the chunk.
|
| 86 |
-
Together your chunks should represent the entire document with overlap.
|
| 87 |
-
|
| 88 |
-
Here is the document:
|
| 89 |
-
|
| 90 |
-
{document["text"]}
|
| 91 |
-
|
| 92 |
-
Respond with the chunks.
|
| 93 |
-
"""
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
@retry(wait=wait)
|
| 97 |
-
def process_document(document):
|
| 98 |
-
prompt = make_prompt(document)
|
| 99 |
-
response = client.models.generate_content(
|
| 100 |
-
model=MODEL,
|
| 101 |
-
contents=prompt,
|
| 102 |
-
config=types.GenerateContentConfig(
|
| 103 |
-
response_mime_type="application/json",
|
| 104 |
-
response_schema=Chunks,
|
| 105 |
-
),
|
| 106 |
-
)
|
| 107 |
-
doc_as_chunks = Chunks.model_validate_json(response.text)
|
| 108 |
-
return [chunk.as_result(document) for chunk in doc_as_chunks.chunks]
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
def create_chunks(documents):
|
| 112 |
-
"""
|
| 113 |
-
Create chunks using a number of workers in parallel.
|
| 114 |
-
If you get a rate limit error, set the WORKERS to 1.
|
| 115 |
-
"""
|
| 116 |
-
chunks = []
|
| 117 |
-
with Pool(processes=WORKERS) as pool:
|
| 118 |
-
for result in tqdm(pool.imap_unordered(process_document, documents), total=len(documents)):
|
| 119 |
-
chunks.extend(result)
|
| 120 |
-
return chunks
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
def create_embeddings(chunks):
|
| 124 |
-
chroma = PersistentClient(path=DB_NAME)
|
| 125 |
-
if collection_name in [c.name for c in chroma.list_collections()]:
|
| 126 |
-
chroma.delete_collection(collection_name)
|
| 127 |
-
|
| 128 |
-
texts = [chunk.page_content for chunk in chunks]
|
| 129 |
-
vectors = embeddings.embed_documents(texts)
|
| 130 |
-
|
| 131 |
-
collection = chroma.get_or_create_collection(collection_name)
|
| 132 |
-
|
| 133 |
-
ids = [str(i) for i in range(len(chunks))]
|
| 134 |
-
metas = [chunk.metadata for chunk in chunks]
|
| 135 |
-
|
| 136 |
-
collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
|
| 137 |
-
print(f"Vectorstore created with {collection.count()} documents")
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
if __name__ == "__main__":
|
| 141 |
-
documents = fetch_documents()
|
| 142 |
-
chunks = create_chunks(documents)
|
| 143 |
-
create_embeddings(chunks)
|
| 144 |
-
print("Ingestion complete")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|