rmt4genai committed on
Commit
ee25382
·
1 Parent(s): 9493d15

renamed files

Browse files
evaluation/{eval_2.py → eval_gemini_unranked.py} RENAMED
File without changes
evaluation/{eval_1.py → eval_qwen.py} RENAMED
File without changes
pro_implementation/answer_gemini.py → implementation/answer_gemini_adv RENAMED
File without changes
implementation/{answer_1.py → answer_qwen.py} RENAMED
@@ -5,9 +5,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
5
  from langchain_core.messages import SystemMessage, HumanMessage, convert_to_messages
6
  from langchain_core.documents import Document
7
  from ollama import Client
8
-
9
  from dotenv import load_dotenv
10
- from openai import OpenAI
11
 
12
 
13
 
 
5
  from langchain_core.messages import SystemMessage, HumanMessage, convert_to_messages
6
  from langchain_core.documents import Document
7
  from ollama import Client
 
8
  from dotenv import load_dotenv
 
9
 
10
 
11
 
pro_implementation/answer.py → implementation/answer_qwen_adv RENAMED
File without changes
implementation/{ingest_gemini.py → ingest_gemini_adv.py} RENAMED
File without changes
pro_implementation/__pycache__/answer.cpython-312.pyc DELETED
Binary file (8.36 kB)
 
pro_implementation/__pycache__/answer_gemini.cpython-312.pyc DELETED
Binary file (8.96 kB)
 
pro_implementation/ingest.py DELETED
@@ -1,143 +0,0 @@
1
- from pathlib import Path
2
- from dotenv import load_dotenv
3
- from pydantic import BaseModel, Field
4
- from chromadb import PersistentClient
5
- from tqdm import tqdm
6
- from litellm import completion
7
- from multiprocessing import Pool
8
- from tenacity import retry, wait_exponential
9
- from langchain_huggingface import HuggingFaceEmbeddings
10
-
11
- load_dotenv(override=True)
12
-
13
- MODEL = "ollama/qwen3:4b"
14
-
15
- DB_NAME = str(Path(__file__).parent.parent / "preprocessed_db")
16
- collection_name = "docs"
17
- embeddings = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B")
18
- KNOWLEDGE_BASE_PATH = Path(__file__).parent.parent / "knowledge-base"
19
- AVERAGE_CHUNK_SIZE = 500
20
- wait = wait_exponential(multiplier=1, min=10, max=240)
21
-
22
-
23
- WORKERS = 2
24
-
25
-
26
- class Result(BaseModel):
27
- page_content: str
28
- metadata: dict
29
-
30
-
31
- class Chunk(BaseModel):
32
- headline: str = Field(
33
- description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query",
34
- )
35
- summary: str = Field(
36
- description="A few sentences summarizing the content of this chunk to answer common questions"
37
- )
38
- original_text: str = Field(
39
- description="The original text of this chunk from the provided document, exactly as is, not changed in any way"
40
- )
41
-
42
- def as_result(self, document):
43
- metadata = {"source": document["source"], "type": document["type"]}
44
- return Result(
45
- page_content=self.headline + "\n\n" + self.summary + "\n\n" + self.original_text,
46
- metadata=metadata,
47
- )
48
-
49
-
50
- class Chunks(BaseModel):
51
- chunks: list[Chunk]
52
-
53
-
54
- def fetch_documents():
55
- """A homemade version of the LangChain DirectoryLoader"""
56
-
57
- documents = []
58
-
59
- for folder in KNOWLEDGE_BASE_PATH.iterdir():
60
- doc_type = folder.name
61
- for file in folder.rglob("*.md"):
62
- with open(file, "r", encoding="utf-8") as f:
63
- documents.append({"type": doc_type, "source": file.as_posix(), "text": f.read()})
64
-
65
- print(f"Loaded {len(documents)} documents")
66
- return documents
67
-
68
-
69
- def make_prompt(document):
70
- how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1
71
- return f"""
72
- You take a document and you split the document into overlapping chunks for a KnowledgeBase.
73
-
74
- The document is from my portfolio website's github repo.
75
- The document is of type: {document["type"]}
76
- The document has been retrieved from: {document["source"]}
77
-
78
- A chatbot will use these chunks to answer questions about my skills, experience and projects.
79
- You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
80
- This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
81
- There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.
82
-
83
- For each chunk, you should provide a headline, a summary, and the original text of the chunk.
84
- Together your chunks should represent the entire document with overlap.
85
-
86
- Here is the document:
87
-
88
- {document["text"]}
89
-
90
- Respond with the chunks.
91
- """
92
-
93
-
94
- def make_messages(document):
95
- return [
96
- {"role": "user", "content": make_prompt(document)},
97
- ]
98
-
99
-
100
- @retry(wait=wait)
101
- def process_document(document):
102
- messages = make_messages(document)
103
- response = completion(model=MODEL, messages=messages, response_format=Chunks,base_url="http://localhost:11434")
104
- reply = response.choices[0].message.content
105
- doc_as_chunks = Chunks.model_validate_json(reply).chunks
106
- return [chunk.as_result(document) for chunk in doc_as_chunks]
107
-
108
-
109
- def create_chunks(documents):
110
- """
111
- Create chunks using a number of workers in parallel.
112
- If you get a rate limit error, set the WORKERS to 1.
113
- """
114
- chunks = []
115
- with Pool(processes=WORKERS) as pool:
116
- for result in tqdm(pool.imap_unordered(process_document, documents), total=len(documents)):
117
- chunks.extend(result)
118
- return chunks
119
-
120
-
121
- def create_embeddings(chunks):
122
- chroma = PersistentClient(path=DB_NAME)
123
- if collection_name in [c.name for c in chroma.list_collections()]:
124
- chroma.delete_collection(collection_name)
125
-
126
- texts = [chunk.page_content for chunk in chunks]
127
- vectors= embeddings.embed_documents(texts)
128
-
129
-
130
- collection = chroma.get_or_create_collection(collection_name)
131
-
132
- ids = [str(i) for i in range(len(chunks))]
133
- metas = [chunk.metadata for chunk in chunks]
134
-
135
- collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
136
- print(f"Vectorstore created with {collection.count()} documents")
137
-
138
-
139
- if __name__ == "__main__":
140
- documents = fetch_documents()
141
- chunks = create_chunks(documents)
142
- create_embeddings(chunks)
143
- print("Ingestion complete")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pro_implementation/ingest_gemini.py DELETED
@@ -1,144 +0,0 @@
1
- from pathlib import Path
2
- from dotenv import load_dotenv
3
- from pydantic import BaseModel, Field
4
- from chromadb import PersistentClient
5
- from tqdm import tqdm
6
- from google import genai
7
- from google.genai import types
8
- from multiprocessing import Pool
9
- from tenacity import retry, wait_exponential
10
- from langchain_huggingface import HuggingFaceEmbeddings
11
-
12
- load_dotenv(override=True)
13
-
14
- MODEL = "gemini-2.5-flash"
15
-
16
- DB_NAME = str(Path(__file__).parent.parent / "preprocessed_db")
17
- collection_name = "docs"
18
- embeddings = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B")
19
- KNOWLEDGE_BASE_PATH = Path(__file__).parent.parent / "knowledge-base"
20
- AVERAGE_CHUNK_SIZE = 500
21
- wait = wait_exponential(multiplier=1, min=10, max=240)
22
-
23
- client = genai.Client()
24
-
25
- WORKERS = 2
26
-
27
-
28
- class Result(BaseModel):
29
- page_content: str
30
- metadata: dict
31
-
32
-
33
- class Chunk(BaseModel):
34
- headline: str = Field(
35
- description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query",
36
- )
37
- summary: str = Field(
38
- description="A few sentences summarizing the content of this chunk to answer common questions"
39
- )
40
- original_text: str = Field(
41
- description="The original text of this chunk from the provided document, exactly as is, not changed in any way"
42
- )
43
-
44
- def as_result(self, document):
45
- metadata = {"source": document["source"], "type": document["type"]}
46
- return Result(
47
- page_content=self.headline + "\n\n" + self.summary + "\n\n" + self.original_text,
48
- metadata=metadata,
49
- )
50
-
51
-
52
- class Chunks(BaseModel):
53
- chunks: list[Chunk]
54
-
55
-
56
- def fetch_documents():
57
- """A homemade version of the LangChain DirectoryLoader"""
58
-
59
- documents = []
60
-
61
- for folder in KNOWLEDGE_BASE_PATH.iterdir():
62
- doc_type = folder.name
63
- for file in folder.rglob("*.md"):
64
- with open(file, "r", encoding="utf-8") as f:
65
- documents.append({"type": doc_type, "source": file.as_posix(), "text": f.read()})
66
-
67
- print(f"Loaded {len(documents)} documents")
68
- return documents
69
-
70
-
71
- def make_prompt(document):
72
- how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1
73
- return f"""
74
- You take a document and you split the document into overlapping chunks for a KnowledgeBase.
75
-
76
- The document is from my portfolio website's github repo.
77
- The document is of type: {document["type"]}
78
- The document has been retrieved from: {document["source"]}
79
-
80
- A chatbot will use these chunks to answer questions about my skills, experience and projects.
81
- You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
82
- This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
83
- There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.
84
-
85
- For each chunk, you should provide a headline, a summary, and the original text of the chunk.
86
- Together your chunks should represent the entire document with overlap.
87
-
88
- Here is the document:
89
-
90
- {document["text"]}
91
-
92
- Respond with the chunks.
93
- """
94
-
95
-
96
- @retry(wait=wait)
97
- def process_document(document):
98
- prompt = make_prompt(document)
99
- response = client.models.generate_content(
100
- model=MODEL,
101
- contents=prompt,
102
- config=types.GenerateContentConfig(
103
- response_mime_type="application/json",
104
- response_schema=Chunks,
105
- ),
106
- )
107
- doc_as_chunks = Chunks.model_validate_json(response.text)
108
- return [chunk.as_result(document) for chunk in doc_as_chunks.chunks]
109
-
110
-
111
- def create_chunks(documents):
112
- """
113
- Create chunks using a number of workers in parallel.
114
- If you get a rate limit error, set the WORKERS to 1.
115
- """
116
- chunks = []
117
- with Pool(processes=WORKERS) as pool:
118
- for result in tqdm(pool.imap_unordered(process_document, documents), total=len(documents)):
119
- chunks.extend(result)
120
- return chunks
121
-
122
-
123
- def create_embeddings(chunks):
124
- chroma = PersistentClient(path=DB_NAME)
125
- if collection_name in [c.name for c in chroma.list_collections()]:
126
- chroma.delete_collection(collection_name)
127
-
128
- texts = [chunk.page_content for chunk in chunks]
129
- vectors = embeddings.embed_documents(texts)
130
-
131
- collection = chroma.get_or_create_collection(collection_name)
132
-
133
- ids = [str(i) for i in range(len(chunks))]
134
- metas = [chunk.metadata for chunk in chunks]
135
-
136
- collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
137
- print(f"Vectorstore created with {collection.count()} documents")
138
-
139
-
140
- if __name__ == "__main__":
141
- documents = fetch_documents()
142
- chunks = create_chunks(documents)
143
- create_embeddings(chunks)
144
- print("Ingestion complete")