Spaces:
Running
Running
GitHub Actions committed on
Commit ·
ba41aa8
1
Parent(s): 7ca251e
Auto-format code with isort and black
Browse files
- benchmarks/retrieval/retrieve.py +8 -6
- sage/config.py +3 -1
- sage/embedder.py +10 -19
- sage/reranker.py +1 -0
- sage/vector_store.py +1 -1
benchmarks/retrieval/retrieve.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
|
| 3 |
Make sure to `pip install ir_measures` before running this script.
|
| 4 |
"""
|
|
|
|
| 5 |
import json
|
| 6 |
import logging
|
| 7 |
import os
|
|
@@ -21,6 +22,7 @@ logger.setLevel(logging.INFO)
|
|
| 21 |
|
| 22 |
load_dotenv()
|
| 23 |
|
|
|
|
| 24 |
def main():
|
| 25 |
parser = configargparse.ArgParser(
|
| 26 |
description="Runs retrieval on a benchmark dataset.", ignore_unknown_config_file_keys=True
|
|
@@ -49,12 +51,12 @@ def main():
|
|
| 49 |
args = parser.parse_args()
|
| 50 |
sage.config.validate_vector_store_args(args)
|
| 51 |
repo_manager = GitHubRepoManager(
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
)
|
| 59 |
repo_manager.download()
|
| 60 |
retriever = build_retriever_from_args(args, repo_manager)
|
|
|
|
| 2 |
|
| 3 |
Make sure to `pip install ir_measures` before running this script.
|
| 4 |
"""
|
| 5 |
+
|
| 6 |
import json
|
| 7 |
import logging
|
| 8 |
import os
|
|
|
|
| 22 |
|
| 23 |
load_dotenv()
|
| 24 |
|
| 25 |
+
|
| 26 |
def main():
|
| 27 |
parser = configargparse.ArgParser(
|
| 28 |
description="Runs retrieval on a benchmark dataset.", ignore_unknown_config_file_keys=True
|
|
|
|
| 51 |
args = parser.parse_args()
|
| 52 |
sage.config.validate_vector_store_args(args)
|
| 53 |
repo_manager = GitHubRepoManager(
|
| 54 |
+
args.repo_id,
|
| 55 |
+
commit_hash=args.commit_hash,
|
| 56 |
+
access_token=os.getenv("GITHUB_TOKEN"),
|
| 57 |
+
local_dir=args.local_dir,
|
| 58 |
+
inclusion_file=args.include,
|
| 59 |
+
exclusion_file=args.exclude,
|
| 60 |
)
|
| 61 |
repo_manager.download()
|
| 62 |
retriever = build_retriever_from_args(args, repo_manager)
|
sage/config.py
CHANGED
|
@@ -313,7 +313,9 @@ def _validate_gemini_embedding_args(args):
|
|
| 313 |
"""Validates the configuration of the Gemini batch embedder and sets defaults."""
|
| 314 |
if not args.embedding_model:
|
| 315 |
args.embedding_model = "models/text-embedding-004"
|
| 316 |
-
assert os.environ[
|
|
|
|
|
|
|
| 317 |
if not args.chunks_per_batch:
|
| 318 |
args.chunks_per_batch = GEMINI_MAX_CHUNKS_PER_BATCH
|
| 319 |
elif args.chunks_per_batch > GEMINI_MAX_CHUNKS_PER_BATCH:
|
|
|
|
| 313 |
"""Validates the configuration of the Gemini batch embedder and sets defaults."""
|
| 314 |
if not args.embedding_model:
|
| 315 |
args.embedding_model = "models/text-embedding-004"
|
| 316 |
+
assert os.environ[
|
| 317 |
+
"GOOGLE_API_KEY"
|
| 318 |
+
], "Please set the GOOGLE_API_KEY environment variable if using `gemini` embeddings."
|
| 319 |
if not args.chunks_per_batch:
|
| 320 |
args.chunks_per_batch = GEMINI_MAX_CHUNKS_PER_BATCH
|
| 321 |
elif args.chunks_per_batch > GEMINI_MAX_CHUNKS_PER_BATCH:
|
sage/embedder.py
CHANGED
|
@@ -4,25 +4,17 @@ import json
|
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import time
|
| 7 |
-
from abc import ABC
|
| 8 |
-
from abc import abstractmethod
|
| 9 |
from collections import Counter
|
| 10 |
-
from typing import Dict
|
| 11 |
-
from typing import Generator
|
| 12 |
-
from typing import List
|
| 13 |
-
from typing import Optional
|
| 14 |
-
from typing import Tuple
|
| 15 |
|
| 16 |
import google.generativeai as genai
|
| 17 |
import marqo
|
| 18 |
import requests
|
| 19 |
from openai import OpenAI
|
| 20 |
-
from tenacity import retry
|
| 21 |
-
from tenacity import stop_after_attempt
|
| 22 |
-
from tenacity import wait_random_exponential
|
| 23 |
|
| 24 |
-
from sage.chunker import Chunk
|
| 25 |
-
from sage.chunker import Chunker
|
| 26 |
from sage.constants import TEXT_FIELD
|
| 27 |
from sage.data_manager import DataManager
|
| 28 |
|
|
@@ -72,7 +64,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
|
|
| 72 |
|
| 73 |
if len(batch) > chunks_per_batch:
|
| 74 |
for i in range(0, len(batch), chunks_per_batch):
|
| 75 |
-
sub_batch = batch[i: i + chunks_per_batch]
|
| 76 |
openai_batch_id = self._issue_job_for_chunks(sub_batch, batch_id=f"{dataset_name}/{len(batch_ids)}")
|
| 77 |
batch_ids[openai_batch_id] = [chunk.metadata for chunk in sub_batch]
|
| 78 |
if max_embedding_jobs and len(batch_ids) >= max_embedding_jobs:
|
|
@@ -242,7 +234,7 @@ class VoyageBatchEmbedder(BatchEmbedder):
|
|
| 242 |
|
| 243 |
if len(batch) > chunks_per_batch:
|
| 244 |
for i in range(0, len(batch), chunks_per_batch):
|
| 245 |
-
sub_batch = batch[i: i + chunks_per_batch]
|
| 246 |
logging.info("Embedding %d chunks...", len(sub_batch))
|
| 247 |
result = self._make_batch_request(sub_batch)
|
| 248 |
for chunk, datum in zip(sub_batch, result["data"]):
|
|
@@ -314,7 +306,7 @@ class MarqoEmbedder(BatchEmbedder):
|
|
| 314 |
|
| 315 |
if len(batch) > chunks_per_batch:
|
| 316 |
for i in range(0, len(batch), chunks_per_batch):
|
| 317 |
-
sub_batch = batch[i: i + chunks_per_batch]
|
| 318 |
logging.info("Indexing %d chunks...", len(sub_batch))
|
| 319 |
self.index.add_documents(
|
| 320 |
documents=[chunk.metadata for chunk in sub_batch],
|
|
@@ -356,9 +348,8 @@ class GeminiBatchEmbedder(BatchEmbedder):
|
|
| 356 |
|
| 357 |
def _make_batch_request(self, chunks: List[Chunk]) -> Dict:
|
| 358 |
return genai.embed_content(
|
| 359 |
-
model=self.embedding_model,
|
| 360 |
-
|
| 361 |
-
task_type="retrieval_document")
|
| 362 |
|
| 363 |
def embed_dataset(self, chunks_per_batch: int, max_embedding_jobs: int = None):
|
| 364 |
"""Issues batch embedding jobs for the entire dataset."""
|
|
@@ -375,7 +366,7 @@ class GeminiBatchEmbedder(BatchEmbedder):
|
|
| 375 |
|
| 376 |
if len(batch) > chunks_per_batch:
|
| 377 |
for i in range(0, len(batch), chunks_per_batch):
|
| 378 |
-
sub_batch = batch[i: i + chunks_per_batch]
|
| 379 |
logging.info("Embedding %d chunks...", len(sub_batch))
|
| 380 |
result = self._make_batch_request(sub_batch)
|
| 381 |
for chunk, embedding in zip(sub_batch, result["embedding"]):
|
|
|
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import time
|
| 7 |
+
from abc import ABC, abstractmethod
|
|
|
|
| 8 |
from collections import Counter
|
| 9 |
+
from typing import Dict, Generator, List, Optional, Tuple
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
import google.generativeai as genai
|
| 12 |
import marqo
|
| 13 |
import requests
|
| 14 |
from openai import OpenAI
|
| 15 |
+
from tenacity import retry, stop_after_attempt, wait_random_exponential
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
from sage.chunker import Chunk, Chunker
|
|
|
|
| 18 |
from sage.constants import TEXT_FIELD
|
| 19 |
from sage.data_manager import DataManager
|
| 20 |
|
|
|
|
| 64 |
|
| 65 |
if len(batch) > chunks_per_batch:
|
| 66 |
for i in range(0, len(batch), chunks_per_batch):
|
| 67 |
+
sub_batch = batch[i : i + chunks_per_batch]
|
| 68 |
openai_batch_id = self._issue_job_for_chunks(sub_batch, batch_id=f"{dataset_name}/{len(batch_ids)}")
|
| 69 |
batch_ids[openai_batch_id] = [chunk.metadata for chunk in sub_batch]
|
| 70 |
if max_embedding_jobs and len(batch_ids) >= max_embedding_jobs:
|
|
|
|
| 234 |
|
| 235 |
if len(batch) > chunks_per_batch:
|
| 236 |
for i in range(0, len(batch), chunks_per_batch):
|
| 237 |
+
sub_batch = batch[i : i + chunks_per_batch]
|
| 238 |
logging.info("Embedding %d chunks...", len(sub_batch))
|
| 239 |
result = self._make_batch_request(sub_batch)
|
| 240 |
for chunk, datum in zip(sub_batch, result["data"]):
|
|
|
|
| 306 |
|
| 307 |
if len(batch) > chunks_per_batch:
|
| 308 |
for i in range(0, len(batch), chunks_per_batch):
|
| 309 |
+
sub_batch = batch[i : i + chunks_per_batch]
|
| 310 |
logging.info("Indexing %d chunks...", len(sub_batch))
|
| 311 |
self.index.add_documents(
|
| 312 |
documents=[chunk.metadata for chunk in sub_batch],
|
|
|
|
| 348 |
|
| 349 |
def _make_batch_request(self, chunks: List[Chunk]) -> Dict:
|
| 350 |
return genai.embed_content(
|
| 351 |
+
model=self.embedding_model, content=[chunk.content for chunk in chunks], task_type="retrieval_document"
|
| 352 |
+
)
|
|
|
|
| 353 |
|
| 354 |
def embed_dataset(self, chunks_per_batch: int, max_embedding_jobs: int = None):
|
| 355 |
"""Issues batch embedding jobs for the entire dataset."""
|
|
|
|
| 366 |
|
| 367 |
if len(batch) > chunks_per_batch:
|
| 368 |
for i in range(0, len(batch), chunks_per_batch):
|
| 369 |
+
sub_batch = batch[i : i + chunks_per_batch]
|
| 370 |
logging.info("Embedding %d chunks...", len(sub_batch))
|
| 371 |
result = self._make_batch_request(sub_batch)
|
| 372 |
for chunk, embedding in zip(sub_batch, result["embedding"]):
|
sage/reranker.py
CHANGED
|
@@ -10,6 +10,7 @@ from langchain_core.documents import BaseDocumentCompressor
|
|
| 10 |
from langchain_nvidia_ai_endpoints import NVIDIARerank
|
| 11 |
from langchain_voyageai import VoyageAIRerank
|
| 12 |
|
|
|
|
| 13 |
class RerankerProvider(Enum):
|
| 14 |
NONE = "none"
|
| 15 |
HUGGINGFACE = "huggingface"
|
|
|
|
| 10 |
from langchain_nvidia_ai_endpoints import NVIDIARerank
|
| 11 |
from langchain_voyageai import VoyageAIRerank
|
| 12 |
|
| 13 |
+
|
| 14 |
class RerankerProvider(Enum):
|
| 15 |
NONE = "none"
|
| 16 |
HUGGINGFACE = "huggingface"
|
sage/vector_store.py
CHANGED
|
@@ -198,7 +198,7 @@ def build_vector_store_from_args(args: dict, data_manager: Optional[DataManager]
|
|
| 198 |
else:
|
| 199 |
print("punkt is not downloaded")
|
| 200 |
# Optionally download it
|
| 201 |
-
nltk.download(
|
| 202 |
corpus = [content for content, _ in data_manager.walk()]
|
| 203 |
bm25_encoder = BM25Encoder()
|
| 204 |
bm25_encoder.fit(corpus)
|
|
|
|
| 198 |
else:
|
| 199 |
print("punkt is not downloaded")
|
| 200 |
# Optionally download it
|
| 201 |
+
nltk.download("punkt_tab")
|
| 202 |
corpus = [content for content, _ in data_manager.walk()]
|
| 203 |
bm25_encoder = BM25Encoder()
|
| 204 |
bm25_encoder.fit(corpus)
|