Spaces:

anuragbb
/

Abot

Sleeping

App Files Community

rmt4genai committed on Mar 8

Commit

ee25382

1 Parent(s): 9493d15

renamed files

Browse files

Files changed (10) hide show

evaluation/{eval_2.py → eval_gemini_unranked.py} +0 -0
evaluation/{eval_1.py → eval_qwen.py} +0 -0
pro_implementation/answer_gemini.py → implementation/answer_gemini_adv +0 -0
implementation/{answer_1.py → answer_qwen.py} +0 -2
pro_implementation/answer.py → implementation/answer_qwen_adv +0 -0
implementation/{ingest_gemini.py → ingest_gemini_adv.py} +0 -0
pro_implementation/__pycache__/answer.cpython-312.pyc +0 -0
pro_implementation/__pycache__/answer_gemini.cpython-312.pyc +0 -0
pro_implementation/ingest.py +0 -143
pro_implementation/ingest_gemini.py +0 -144

evaluation/{eval_2.py → eval_gemini_unranked.py} RENAMED Viewed

File without changes

evaluation/{eval_1.py → eval_qwen.py} RENAMED Viewed

File without changes

pro_implementation/answer_gemini.py → implementation/answer_gemini_adv RENAMED Viewed

File without changes

implementation/{answer_1.py → answer_qwen.py} RENAMED Viewed

@@ -5,9 +5,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_core.messages import SystemMessage, HumanMessage, convert_to_messages
 from langchain_core.documents import Document
 from ollama import Client
 from dotenv import load_dotenv
-from openai import OpenAI

 from langchain_core.messages import SystemMessage, HumanMessage, convert_to_messages
 from langchain_core.documents import Document
 from ollama import Client
 from dotenv import load_dotenv

pro_implementation/answer.py → implementation/answer_qwen_adv RENAMED Viewed

File without changes

implementation/{ingest_gemini.py → ingest_gemini_adv.py} RENAMED Viewed

File without changes

pro_implementation/__pycache__/answer.cpython-312.pyc DELETED Viewed

Binary file (8.36 kB)

pro_implementation/__pycache__/answer_gemini.cpython-312.pyc DELETED Viewed

Binary file (8.96 kB)

pro_implementation/ingest.py DELETED Viewed

@@ -1,143 +0,0 @@
-from pathlib import Path
-from dotenv import load_dotenv
-from pydantic import BaseModel, Field
-from chromadb import PersistentClient
-from tqdm import tqdm
-from litellm import completion
-from multiprocessing import Pool
-from tenacity import retry, wait_exponential
-from langchain_huggingface import HuggingFaceEmbeddings
-load_dotenv(override=True)
-MODEL = "ollama/qwen3:4b"
-DB_NAME = str(Path(__file__).parent.parent / "preprocessed_db")
-collection_name = "docs"
-embeddings = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B")
-KNOWLEDGE_BASE_PATH = Path(__file__).parent.parent / "knowledge-base"
-AVERAGE_CHUNK_SIZE = 500
-wait = wait_exponential(multiplier=1, min=10, max=240)
-WORKERS = 2
-class Result(BaseModel):
-    page_content: str
-    metadata: dict
-class Chunk(BaseModel):
-    headline: str = Field(
-        description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query",
-    )
-    summary: str = Field(
-        description="A few sentences summarizing the content of this chunk to answer common questions"
-    )
-    original_text: str = Field(
-        description="The original text of this chunk from the provided document, exactly as is, not changed in any way"
-    )
-    def as_result(self, document):
-        metadata = {"source": document["source"], "type": document["type"]}
-        return Result(
-            page_content=self.headline + "\n\n" + self.summary + "\n\n" + self.original_text,
-            metadata=metadata,
-        )
-class Chunks(BaseModel):
-    chunks: list[Chunk]
-def fetch_documents():
-    """A homemade version of the LangChain DirectoryLoader"""
-    documents = []
-    for folder in KNOWLEDGE_BASE_PATH.iterdir():
-        doc_type = folder.name
-        for file in folder.rglob("*.md"):
-            with open(file, "r", encoding="utf-8") as f:
-                documents.append({"type": doc_type, "source": file.as_posix(), "text": f.read()})
-    print(f"Loaded {len(documents)} documents")
-    return documents
-def make_prompt(document):
-    how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1
-    return f"""
-You take a document and you split the document into overlapping chunks for a KnowledgeBase.
-The document is from my portfolio website's github repo.
-The document is of type: {document["type"]}
-The document has been retrieved from: {document["source"]}
-A chatbot will use these chunks to answer questions about my skills, experience and projects.
-You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
-This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
-There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.
-For each chunk, you should provide a headline, a summary, and the original text of the chunk.
-Together your chunks should represent the entire document with overlap.
-Here is the document:
-{document["text"]}
-Respond with the chunks.
-"""
-def make_messages(document):
-    return [
-        {"role": "user", "content": make_prompt(document)},
-    ]
-@retry(wait=wait)
-def process_document(document):
-    messages = make_messages(document)
-    response = completion(model=MODEL, messages=messages,  response_format=Chunks,base_url="http://localhost:11434")
-    reply = response.choices[0].message.content
-    doc_as_chunks = Chunks.model_validate_json(reply).chunks
-    return [chunk.as_result(document) for chunk in doc_as_chunks]
-def create_chunks(documents):
-    """
-    Create chunks using a number of workers in parallel.
-    If you get a rate limit error, set the WORKERS to 1.
-    """
-    chunks = []
-    with Pool(processes=WORKERS) as pool:
-        for result in tqdm(pool.imap_unordered(process_document, documents), total=len(documents)):
-            chunks.extend(result)
-    return chunks
-def create_embeddings(chunks):
-    chroma = PersistentClient(path=DB_NAME)
-    if collection_name in [c.name for c in chroma.list_collections()]:
-        chroma.delete_collection(collection_name)
-    texts = [chunk.page_content for chunk in chunks]
-    vectors= embeddings.embed_documents(texts)
-    collection = chroma.get_or_create_collection(collection_name)
-    ids = [str(i) for i in range(len(chunks))]
-    metas = [chunk.metadata for chunk in chunks]
-    collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
-    print(f"Vectorstore created with {collection.count()} documents")
-if __name__ == "__main__":
-    documents = fetch_documents()
-    chunks = create_chunks(documents)
-    create_embeddings(chunks)
-    print("Ingestion complete")

pro_implementation/ingest_gemini.py DELETED Viewed

@@ -1,144 +0,0 @@
-from pathlib import Path
-from dotenv import load_dotenv
-from pydantic import BaseModel, Field
-from chromadb import PersistentClient
-from tqdm import tqdm
-from google import genai
-from google.genai import types
-from multiprocessing import Pool
-from tenacity import retry, wait_exponential
-from langchain_huggingface import HuggingFaceEmbeddings
-load_dotenv(override=True)
-MODEL = "gemini-2.5-flash"
-DB_NAME = str(Path(__file__).parent.parent / "preprocessed_db")
-collection_name = "docs"
-embeddings = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-0.6B")
-KNOWLEDGE_BASE_PATH = Path(__file__).parent.parent / "knowledge-base"
-AVERAGE_CHUNK_SIZE = 500
-wait = wait_exponential(multiplier=1, min=10, max=240)
-client = genai.Client()
-WORKERS = 2
-class Result(BaseModel):
-    page_content: str
-    metadata: dict
-class Chunk(BaseModel):
-    headline: str = Field(
-        description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query",
-    )
-    summary: str = Field(
-        description="A few sentences summarizing the content of this chunk to answer common questions"
-    )
-    original_text: str = Field(
-        description="The original text of this chunk from the provided document, exactly as is, not changed in any way"
-    )
-    def as_result(self, document):
-        metadata = {"source": document["source"], "type": document["type"]}
-        return Result(
-            page_content=self.headline + "\n\n" + self.summary + "\n\n" + self.original_text,
-            metadata=metadata,
-        )
-class Chunks(BaseModel):
-    chunks: list[Chunk]
-def fetch_documents():
-    """A homemade version of the LangChain DirectoryLoader"""
-    documents = []
-    for folder in KNOWLEDGE_BASE_PATH.iterdir():
-        doc_type = folder.name
-        for file in folder.rglob("*.md"):
-            with open(file, "r", encoding="utf-8") as f:
-                documents.append({"type": doc_type, "source": file.as_posix(), "text": f.read()})
-    print(f"Loaded {len(documents)} documents")
-    return documents
-def make_prompt(document):
-    how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1
-    return f"""
-You take a document and you split the document into overlapping chunks for a KnowledgeBase.
-The document is from my portfolio website's github repo.
-The document is of type: {document["type"]}
-The document has been retrieved from: {document["source"]}
-A chatbot will use these chunks to answer questions about my skills, experience and projects.
-You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
-This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
-There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.
-For each chunk, you should provide a headline, a summary, and the original text of the chunk.
-Together your chunks should represent the entire document with overlap.
-Here is the document:
-{document["text"]}
-Respond with the chunks.
-"""
-@retry(wait=wait)
-def process_document(document):
-    prompt = make_prompt(document)
-    response = client.models.generate_content(
-        model=MODEL,
-        contents=prompt,
-        config=types.GenerateContentConfig(
-            response_mime_type="application/json",
-            response_schema=Chunks,
-        ),
-    )
-    doc_as_chunks = Chunks.model_validate_json(response.text)
-    return [chunk.as_result(document) for chunk in doc_as_chunks.chunks]
-def create_chunks(documents):
-    """
-    Create chunks using a number of workers in parallel.
-    If you get a rate limit error, set the WORKERS to 1.
-    """
-    chunks = []
-    with Pool(processes=WORKERS) as pool:
-        for result in tqdm(pool.imap_unordered(process_document, documents), total=len(documents)):
-            chunks.extend(result)
-    return chunks
-def create_embeddings(chunks):
-    chroma = PersistentClient(path=DB_NAME)
-    if collection_name in [c.name for c in chroma.list_collections()]:
-        chroma.delete_collection(collection_name)
-    texts = [chunk.page_content for chunk in chunks]
-    vectors = embeddings.embed_documents(texts)
-    collection = chroma.get_or_create_collection(collection_name)
-    ids = [str(i) for i in range(len(chunks))]
-    metas = [chunk.metadata for chunk in chunks]
-    collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
-    print(f"Vectorstore created with {collection.count()} documents")
-if __name__ == "__main__":
-    documents = fetch_documents()
-    chunks = create_chunks(documents)
-    create_embeddings(chunks)
-    print("Ingestion complete")