Spaces:

vichudo
/

agentic-defensor

Sleeping

App Files Files Community

vichudo committed on Mar 18, 2025

Commit

6a4bd6f

1 Parent(s): 8abf329

fix

Browse files

Files changed (4) hide show

.gitignore +10 -0
src/data/__init__.py +0 -0
src/data/document_processor.py +135 -0
src/embeddings/embedder.py +86 -0

.gitignore CHANGED Viewed

@@ -10,6 +10,16 @@ pdfs/
 !requirements.txt
 !docker-compose.yml
 # Python
 __pycache__/
 *.py[cod]

 !requirements.txt
 !docker-compose.yml
+# Allow src directory structure
+!src/
+!src/**
+!src/embeddings/
+!src/embeddings/**
+!src/models/
+!src/models/**
+!src/agents/
+!src/agents/**
 # Python
 __pycache__/
 *.py[cod]

src/data/__init__.py ADDED Viewed

File without changes

src/data/document_processor.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import os
+import pickle
+import faiss
+import numpy as np
+from typing import List, Dict, Any, Optional, Tuple
+from tqdm import tqdm
+from src.utils.config import DATA_DIR, EMBEDDINGS_DIR
+from src.embeddings.embedder import TextEmbedder
+class DocumentProcessor:
+    """
+    Handles document loading, chunking, and processing operations.
+    """
+    def __init__(self, data_dir: str = DATA_DIR, embeddings_dir: str = EMBEDDINGS_DIR):
+        """
+        Initialize the document processor.
+        Args:
+            data_dir: Directory containing the document files
+            embeddings_dir: Directory for storing embeddings and indexes
+        """
+        self.data_dir = data_dir
+        self.embeddings_dir = embeddings_dir
+        self.embedder = TextEmbedder()
+        # Create directories if they don't exist
+        os.makedirs(data_dir, exist_ok=True)
+        os.makedirs(embeddings_dir, exist_ok=True)
+    def process_documents(self, doc_chunks: List[Dict[str, Any]], save: bool = True) -> Tuple[List[Dict[str, Any]], List[List[float]]]:
+        """
+        Process document chunks by generating embeddings and creating a FAISS index.
+        Args:
+            doc_chunks: List of document chunks to process
+            save: Whether to save the processed data to disk
+        Returns:
+            Tuple containing the document chunks and their embeddings
+        """
+        print(f"Processing {len(doc_chunks)} document chunks...")
+        # Extract text chunks for embedding
+        texts = [chunk["chunk"] for chunk in doc_chunks]
+        # Generate embeddings
+        print("Generating embeddings...")
+        embeddings = self.embedder.get_embeddings_for_texts(texts)
+        # Save the results if requested
+        if save:
+            self._save_processed_data(doc_chunks, embeddings)
+        return doc_chunks, embeddings
+    def create_faiss_index(self, embeddings: List[List[float]], save: bool = True) -> faiss.Index:
+        """
+        Create a FAISS index from the document embeddings.
+        Args:
+            embeddings: List of embedding vectors
+            save: Whether to save the index to disk
+        Returns:
+            FAISS index
+        """
+        print("Creating FAISS index...")
+        # Convert embeddings to numpy array
+        embedding_array = np.array(embeddings, dtype='float32')
+        # Get dimensions
+        vector_dimension = embedding_array.shape[1]
+        # Create the index
+        index = faiss.IndexFlatL2(vector_dimension)
+        index.add(embedding_array)
+        print(f"Created FAISS index with {index.ntotal} vectors of dimension {vector_dimension}")
+        # Save the index if requested
+        if save:
+            index_path = os.path.join(self.embeddings_dir, "faiss_index.index")
+            faiss.write_index(index, index_path)
+            print(f"FAISS index saved to {index_path}")
+        return index
+    def _save_processed_data(self, doc_chunks: List[Dict[str, Any]], embeddings: List[List[float]]) -> None:
+        """
+        Save the processed document chunks and embeddings to disk.
+        Args:
+            doc_chunks: List of document chunks
+            embeddings: List of embedding vectors
+        """
+        # Save document chunks
+        chunks_path = os.path.join(self.data_dir, "doc_chunks.pkl")
+        with open(chunks_path, "wb") as f:
+            pickle.dump(doc_chunks, f)
+        print(f"Document chunks saved to {chunks_path}")
+        # Save embeddings
+        embeddings_path = os.path.join(self.embeddings_dir, "embeddings.pkl")
+        with open(embeddings_path, "wb") as f:
+            pickle.dump(embeddings, f)
+        print(f"Embeddings saved to {embeddings_path}")
+    def load_processed_data(self) -> Tuple[List[Dict[str, Any]], List[List[float]], faiss.Index]:
+        """
+        Load processed document chunks, embeddings, and FAISS index from disk.
+        Returns:
+            Tuple containing document chunks, embeddings, and FAISS index
+        """
+        # Load document chunks
+        chunks_path = os.path.join(self.data_dir, "doc_chunks.pkl")
+        with open(chunks_path, "rb") as f:
+            doc_chunks = pickle.load(f)
+        print(f"Document chunks loaded from {chunks_path}")
+        # Load embeddings
+        embeddings_path = os.path.join(self.embeddings_dir, "embeddings.pkl")
+        with open(embeddings_path, "rb") as f:
+            embeddings = pickle.load(f)
+        print(f"Embeddings loaded from {embeddings_path}")
+        # Load FAISS index
+        index_path = os.path.join(self.embeddings_dir, "faiss_index.index")
+        index = faiss.read_index(index_path)
+        print(f"FAISS index loaded from {index_path}")
+        return doc_chunks, embeddings, index

src/embeddings/embedder.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import time
+import numpy as np
+from tqdm import tqdm
+from openai import OpenAI
+from typing import List, Dict, Any, Optional
+from src.utils.config import EMBEDDING_MODEL, EMBEDDING_BATCH_SIZE, OPENAI_API_KEY
+class TextEmbedder:
+    """Class for generating embeddings for document chunks using OpenAI's embeddings API."""
+    def __init__(self, model: str = EMBEDDING_MODEL, batch_size: int = EMBEDDING_BATCH_SIZE):
+        """
+        Initialize the TextEmbedder with the specified embedding model and batch size.
+        Args:
+            model: The OpenAI embedding model to use
+            batch_size: Number of chunks to embed per API call
+        """
+        self.model = model
+        self.batch_size = batch_size
+        self.client = OpenAI(api_key=OPENAI_API_KEY)
+        self.embedding_dim = 1536  # Default dimension for text-embedding-3-small
+    def get_embedding_for_text(self, text: str) -> List[float]:
+        """Generate embedding for a single text."""
+        try:
+            response = self.client.embeddings.create(
+                input=[text],
+                model=self.model
+            )
+            return response.data[0].embedding
+        except Exception as e:
+            print(f"Error generating embedding: {e}")
+            return [0.0] * self.embedding_dim
+    def get_embeddings_for_texts(self, texts: List[str]) -> List[List[float]]:
+        """
+        Compute embeddings for a list of texts using batched API calls.
+        Args:
+            texts: List of text chunks to embed
+        Returns:
+            List of embedding vectors
+        """
+        embeddings = []
+        for i in tqdm(range(0, len(texts), self.batch_size), desc="Embedding chunks"):
+            batch = texts[i:i + self.batch_size]
+            try:
+                response = self.client.embeddings.create(
+                    input=batch,
+                    model=self.model
+                )
+                # Extract embeddings from the response
+                for item in response.data:
+                    embeddings.append(item.embedding)
+            except Exception as e:
+                print(f"Error embedding batch starting at index {i}: {e}")
+                # Append placeholder zero vectors for failed texts
+                for _ in batch:
+                    embeddings.append([0.0] * self.embedding_dim)
+            # Brief pause to avoid rate limits
+            time.sleep(0.2)
+        return embeddings
+    def get_query_embedding(self, query: str) -> np.ndarray:
+        """
+        Generate embedding for a query string and return as numpy array.
+        Args:
+            query: The query text to embed
+        Returns:
+            Numpy array of the embedding
+        """
+        try:
+            q_response = self.client.embeddings.create(
+                input=[query],
+                model=self.model
+            )
+            return np.array(q_response.data[0].embedding, dtype='float32').reshape(1, -1)
+        except Exception as e:
+            print(f"Error creating embedding for query: {e}")
+            return np.zeros((1, self.embedding_dim), dtype='float32')