Upload 5 files

Browse files

Files changed (5) hide show

config.yaml +30 -0
config_loader.py +6 -0
embedding_pipeline.py +133 -0
query_processor.py +54 -0
reranker.py +32 -0

config.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+# Model names used by the embedding pipeline.
+models:
+  # Passed to SentenceTransformer(...) when the pipeline is constructed.
+  sentence_transformer: "all-MiniLM-L6-v2"
+  # Model name sent with the OpenAI document-embedding request.
+  openai: "text-embedding-ada-002"
+# Credentials. The value below is a placeholder and is passed to OpenAI(api_key=...).
+api_keys:
+  openai: "your-openai-api-key"
+# Name of the ChromaDB collection the pipeline creates.
+chroma:
+  collection_name: "documents"
+# Vector size used to build the FAISS IndexFlatL2.
+faiss:
+  dimension: 384  # Should match the SentenceTransformer model output dimension
+# One entry per field to extract. For each entry the pipeline reads:
+#   prompt           - the query text
+#   embedding_method - one of: sentence_transformer, openai, chroma, faiss
+#   top_k            - number of results requested and kept after reranking
+fields:
+  legal_department:
+    prompt: "Which department should the completed form be returned to?"
+    embedding_method: "sentence_transformer"
+    top_k: 3
+  party_info:
+    prompt: "Who are the eligible parties for forwarding collateral?"
+    embedding_method: "openai"
+    top_k: 2
+  contact_info:
+    prompt: "What is the contact information for the trading desk?"
+    embedding_method: "chroma"
+    top_k: 1
+  scope:
+    prompt: "What is the scope of review mentioned?"
+    embedding_method: "faiss"
+    top_k: 2

config_loader.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import yaml
+def load_config(config_path: str) -> dict:
+    """Load configuration from a YAML file."""
+    with open(config_path, 'r') as f:
+        return yaml.safe_load(f)

embedding_pipeline.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import json
+from typing import List, Dict, Any
+import chromadb
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from openai import OpenAI
+from reranker import Reranker
+from query_processor import QueryProcessor
+from config_loader import load_config
+class EmbeddingPipeline:
+    def __init__(self, config_path: str):
+        # Load configuration
+        self.config = load_config(config_path)
+        # Initialize embedding models
+        self.sentence_model = SentenceTransformer(self.config['models']['sentence_transformer'])
+        self.openai_client = OpenAI(api_key=self.config['api_keys']['openai'])
+        # Initialize ChromaDB
+        self.chroma_client = chromadb.Client()
+        self.chroma_collection = self.chroma_client.create_collection(
+            name=self.config['chroma']['collection_name']
+        )
+        # Initialize FAISS
+        self.dimension = self.config['faiss']['dimension']
+        self.faiss_index = faiss.IndexFlatL2(self.dimension)
+        # Store documents and embeddings
+        self.documents = []
+        self.chroma_embeddings = []
+        self.faiss_embeddings = []
+        self.openai_embeddings = []
+        self.sentence_embeddings = []
+        # Initialize reranker and query processor
+        self.reranker = Reranker()
+        self.query_processor = QueryProcessor(
+            sentence_model=self.sentence_model,
+            openai_client=self.openai_client,
+            chroma_collection=self.chroma_collection,
+            faiss_index=self.faiss_index
+        )
+    def load_data(self, json_data: Dict) -> List[Dict]:
+        """Load and flatten JSON data into documents."""
+        documents = []
+        for item in json_data:
+            doc = {
+                'content': item['content'],
+                'type': item['type'],
+                'page': item['page'],
+                'id': f"doc_{len(documents)}"
+            }
+            documents.append(doc)
+        self.documents = documents
+        return documents
+    def generate_embeddings(self):
+        """Generate embeddings using all methods."""
+        texts = [doc['content'] for doc in self.documents]
+        # Sentence Transformer embeddings
+        self.sentence_embeddings = self.sentence_model.encode(texts)
+        # OpenAI embeddings
+        openai_response = self.openai_client.embeddings.create(
+            input=texts,
+            model=self.config['models']['openai']
+        )
+        self.openai_embeddings = [embedding.embedding for embedding in openai_response.data]
+        # ChromaDB embeddings
+        self.chroma_collection.add(
+            documents=texts,
+            ids=[doc['id'] for doc in self.documents]
+        )
+        self.chroma_embeddings = self.chroma_collection.get(include=['embeddings'])['embeddings']
+        # FAISS embeddings (using Sentence Transformer embeddings for FAISS)
+        self.faiss_embeddings = np.array(self.sentence_embeddings)
+        self.faiss_index.add(self.faiss_embeddings)
+        # Prepare reranker
+        self.reranker.prepare(self.documents)
+    def process_queries(self) -> Dict[str, List[Dict]]:
+        """Process all configured field queries."""
+        results = {}
+        for field, settings in self.config['fields'].items():
+            prompt = settings['prompt']
+            top_k = settings['top_k']
+            method = settings['embedding_method']
+            # Query using the appropriate method
+            initial_results = self.query_processor.query(
+                prompt=prompt,
+                documents=self.documents,
+                embedding_method=method,
+                top_k=top_k,
+                sentence_embeddings=self.sentence_embeddings,
+                openai_embeddings=self.openai_embeddings
+            )
+            # Rerank results
+            reranked_results = self.reranker.rerank(prompt, initial_results, top_k)
+            results[field] = reranked_results
+        return results
+# Example usage
+if __name__ == "__main__":
+    # Sample JSON data (replace with actual input)
+    json_data = [
+        {
+            "type": "paragraph",
+            "content": "Return completed form and executed originals to the LEGAL DEPARTMENT.",
+            "page": "36"
+        },
+        # Add more entries as in your JSON
+    ]
+    # Initialize pipeline
+    pipeline = EmbeddingPipeline("config.yaml")
+    # Load and process data
+    pipeline.load_data(json_data)
+    pipeline.generate_embeddings()
+    # Query and get results
+    results = pipeline.process_queries()
+    print(json.dumps(results, indent=2))

query_processor.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import numpy as np
+from typing import List, Dict, Any
+class QueryProcessor:
+    def __init__(self, sentence_model, openai_client, chroma_collection, faiss_index):
+        self.sentence_model = sentence_model
+        self.openai_client = openai_client
+        self.chroma_collection = chroma_collection
+        self.faiss_index = faiss_index
+    def query(self, prompt: str, documents: List[Dict], embedding_method: str, top_k: int,
+              sentence_embeddings: np.ndarray, openai_embeddings: np.ndarray) -> List[Dict]:
+        """Query using the specified embedding method."""
+        if embedding_method == 'sentence_transformer':
+            query_embedding = self.sentence_model.encode([prompt])[0]
+            distances = np.linalg.norm(sentence_embeddings - query_embedding, axis=1)
+            indices = np.argsort(distances)[:top_k]
+            return [
+                {'id': documents[i]['id'], 'content': documents[i]['content'], 'score': float(distances[i])}
+                for i in indices
+            ]
+        elif embedding_method == 'openai':
+            query_embedding = self.openai_client.embeddings.create(
+                input=[prompt],
+                model="text-embedding-ada-002"
+            ).data[0].embedding
+            distances = np.linalg.norm(np.array(openai_embeddings) - query_embedding, axis=1)
+            indices = np.argsort(distances)[:top_k]
+            return [
+                {'id': documents[i]['id'], 'content': documents[i]['content'], 'score': float(distances[i])}
+                for i in indices
+            ]
+        elif embedding_method == 'chroma':
+            results = self.chroma_collection.query(
+                query_texts=[prompt],
+                n_results=top_k
+            )
+            return [
+                {'id': id, 'content': text, 'score': dist}
+                for id, text, dist in zip(results['ids'][0], results['documents'][0], results['distances'][0])
+            ]
+        elif embedding_method == 'faiss':
+            query_embedding = self.sentence_model.encode([prompt])[0]
+            distances, indices = self.faiss_index.search(np.array([query_embedding]), top_k)
+            return [
+                {'id': documents[i]['id'], 'content': documents[i]['content'], 'score': float(distances[0][j])}
+                for j, i in enumerate(indices[0])
+            ]
+        else:
+            raise ValueError(f"Unsupported embedding method: {embedding_method}")

reranker.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from rank_bm25 import BM25Okapi
+from typing import List, Dict
+class Reranker:
+    def __init__(self):
+        self.tokenized_docs = []
+        self.bm25 = None
+    def prepare(self, documents: List[Dict]):
+        """Prepare the reranker with documents."""
+        self.tokenized_docs = [doc['content'].split() for doc in documents]
+        self.bm25 = BM25Okapi(self.tokenized_docs)
+    def rerank(self, query: str, initial_results: List[Dict], top_k: int) -> List[Dict]:
+        """Rerank initial search results using BM25."""
+        tokenized_query = query.split()
+        scores = self.bm25.get_scores(tokenized_query)
+        # Combine initial scores with BM25 scores
+        reranked = []
+        for idx, result in enumerate(initial_results):
+            doc_idx = int(result['id'].split('_')[1])
+            combined_score = result['score'] + scores[doc_idx]
+            reranked.append({
+                'id': result['id'],
+                'content': result['content'],
+                'score': combined_score
+            })
+        # Sort by combined score
+        reranked.sort(key=lambda x: x['score'], reverse=True)
+        return reranked[:top_k]