Kushalguptaiitb committed on
Commit
07bb79d
·
verified ·
1 Parent(s): 29fc451

Upload 5 files

Browse files
Files changed (5) hide show
  1. config.yaml +30 -0
  2. config_loader.py +6 -0
  3. embedding_pipeline.py +133 -0
  4. query_processor.py +54 -0
  5. reranker.py +32 -0
config.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Embedding model identifiers, one per backend.
models:
  sentence_transformer: "all-MiniLM-L6-v2"
  openai: "text-embedding-ada-002"

# API credentials.
# NOTE(review): placeholder value — load from an environment variable or a
# secrets manager rather than committing a real key to the repository.
api_keys:
  openai: "your-openai-api-key"

# ChromaDB settings.
chroma:
  collection_name: "documents"

# FAISS settings.
faiss:
  dimension: 384 # Should match the SentenceTransformer model output dimension

# Field queries. Each field defines: the prompt to ask, which retrieval
# backend to use (sentence_transformer | openai | chroma | faiss), and how
# many results to keep (top_k).
fields:
  legal_department:
    prompt: "Which department should the completed form be returned to?"
    embedding_method: "sentence_transformer"
    top_k: 3
  party_info:
    prompt: "Who are the eligible parties for forwarding collateral?"
    embedding_method: "openai"
    top_k: 2
  contact_info:
    prompt: "What is the contact information for the trading desk?"
    embedding_method: "chroma"
    top_k: 1
  scope:
    prompt: "What is the scope of review mentioned?"
    embedding_method: "faiss"
    top_k: 2
config_loader.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
import yaml


def load_config(config_path: str) -> dict:
    """Load configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration (a dict for the expected config layout).

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Explicit encoding avoids platform-dependent default codecs (e.g.
    # cp1252 on Windows). safe_load — never yaml.load — so a malicious
    # config file cannot execute arbitrary Python.
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
embedding_pipeline.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import List, Dict, Any
3
+ import chromadb
4
+ import faiss
5
+ import numpy as np
6
+ from sentence_transformers import SentenceTransformer
7
+ from openai import OpenAI
8
+ from reranker import Reranker
9
+ from query_processor import QueryProcessor
10
+ from config_loader import load_config
11
+
12
class EmbeddingPipeline:
    """End-to-end retrieval pipeline.

    Loads documents from JSON records, embeds them with several backends
    (SentenceTransformer, OpenAI, ChromaDB, FAISS), then answers the field
    queries declared in the YAML config and reranks the results with BM25.
    """

    def __init__(self, config_path: str):
        """Build all embedding backends from the YAML config at *config_path*."""
        # Load configuration
        self.config = load_config(config_path)

        # Initialize embedding models
        self.sentence_model = SentenceTransformer(self.config['models']['sentence_transformer'])
        self.openai_client = OpenAI(api_key=self.config['api_keys']['openai'])

        # Initialize ChromaDB. get_or_create_collection (rather than
        # create_collection) makes repeated pipeline construction idempotent:
        # create_collection raises if the collection already exists.
        self.chroma_client = chromadb.Client()
        self.chroma_collection = self.chroma_client.get_or_create_collection(
            name=self.config['chroma']['collection_name']
        )

        # Initialize FAISS (flat index, L2 / Euclidean distance).
        # NOTE(review): the configured dimension must match the
        # SentenceTransformer output size (384 for all-MiniLM-L6-v2);
        # nothing enforces this beyond the config comment — confirm.
        self.dimension = self.config['faiss']['dimension']
        self.faiss_index = faiss.IndexFlatL2(self.dimension)

        # Per-backend caches, populated by load_data()/generate_embeddings()
        self.documents = []
        self.chroma_embeddings = []
        self.faiss_embeddings = []
        self.openai_embeddings = []
        self.sentence_embeddings = []

        # Initialize reranker and query processor
        self.reranker = Reranker()
        self.query_processor = QueryProcessor(
            sentence_model=self.sentence_model,
            openai_client=self.openai_client,
            chroma_collection=self.chroma_collection,
            faiss_index=self.faiss_index
        )

    def load_data(self, json_data: List[Dict[str, Any]]) -> List[Dict]:
        """Load and flatten JSON records into documents with stable ids.

        Each record must carry 'content', 'type' and 'page' keys. Ids are
        assigned positionally as doc_0, doc_1, ... — the Reranker relies on
        this "doc_<index>" format to map results back to corpus positions.

        Args:
            json_data: Sequence of raw record dicts.

        Returns:
            The list of document dicts (also stored on ``self.documents``).
        """
        documents = []
        for item in json_data:
            doc = {
                'content': item['content'],
                'type': item['type'],
                'page': item['page'],
                'id': f"doc_{len(documents)}"
            }
            documents.append(doc)
        self.documents = documents
        return documents

    def generate_embeddings(self):
        """Embed all loaded documents with every backend and build indexes.

        Must be called after load_data(). Populates the per-backend caches,
        adds the texts to the ChromaDB collection, fills the FAISS index,
        and prepares the BM25 reranker over the corpus.
        """
        texts = [doc['content'] for doc in self.documents]

        # Sentence Transformer embeddings
        self.sentence_embeddings = self.sentence_model.encode(texts)

        # OpenAI embeddings — single batched API call for all documents
        openai_response = self.openai_client.embeddings.create(
            input=texts,
            model=self.config['models']['openai']
        )
        self.openai_embeddings = [embedding.embedding for embedding in openai_response.data]

        # ChromaDB: the collection embeds the raw texts with its default
        # embedding function when none is supplied.
        self.chroma_collection.add(
            documents=texts,
            ids=[doc['id'] for doc in self.documents]
        )
        # NOTE(review): collection.get() is not documented to preserve
        # insertion order — verify alignment before consuming these
        # embeddings positionally alongside self.documents.
        self.chroma_embeddings = self.chroma_collection.get(include=['embeddings'])['embeddings']

        # FAISS index is built over the Sentence Transformer embeddings
        self.faiss_embeddings = np.array(self.sentence_embeddings)
        self.faiss_index.add(self.faiss_embeddings)

        # Prepare reranker (BM25 over the full corpus)
        self.reranker.prepare(self.documents)

    def process_queries(self) -> Dict[str, List[Dict]]:
        """Run every field query from the config and rerank its results.

        Returns:
            Mapping of field name -> reranked list of
            {'id', 'content', 'score'} dicts (at most top_k per field).
        """
        results = {}
        for field, settings in self.config['fields'].items():
            prompt = settings['prompt']
            top_k = settings['top_k']
            method = settings['embedding_method']

            # Retrieve with the backend configured for this field
            initial_results = self.query_processor.query(
                prompt=prompt,
                documents=self.documents,
                embedding_method=method,
                top_k=top_k,
                sentence_embeddings=self.sentence_embeddings,
                openai_embeddings=self.openai_embeddings
            )

            # Fuse retrieval scores with BM25 and keep the top_k
            reranked_results = self.reranker.rerank(prompt, initial_results, top_k)
            results[field] = reranked_results
        return results
111
+
112
# Example usage
if __name__ == "__main__":
    # Sample JSON data (replace with actual input)
    sample_records = [
        {
            "type": "paragraph",
            "content": "Return completed form and executed originals to the LEGAL DEPARTMENT.",
            "page": "36"
        },
        # Add more entries as in your JSON
    ]

    # Build the pipeline from the YAML config, ingest the records,
    # generate embeddings for every backend, then run the field queries.
    pipeline = EmbeddingPipeline("config.yaml")
    pipeline.load_data(sample_records)
    pipeline.generate_embeddings()

    field_results = pipeline.process_queries()
    print(json.dumps(field_results, indent=2))
query_processor.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from typing import List, Dict, Any
3
+
4
class QueryProcessor:
    """Routes a text prompt to one of four retrieval backends and returns
    the top-k matching documents with similarity scores.
    """

    def __init__(self, sentence_model, openai_client, chroma_collection, faiss_index,
                 openai_model: str = "text-embedding-ada-002"):
        """Store backend handles.

        Args:
            sentence_model: SentenceTransformer-style model exposing .encode().
            openai_client: OpenAI client exposing .embeddings.create().
            chroma_collection: ChromaDB collection exposing .query().
            faiss_index: FAISS index exposing .search().
            openai_model: Embedding model used for OpenAI queries. Previously
                hard-coded inside query(); parameterized (with the same
                default, so existing callers are unaffected) so it can be
                kept in sync with the model used at indexing time.
        """
        self.sentence_model = sentence_model
        self.openai_client = openai_client
        self.chroma_collection = chroma_collection
        self.faiss_index = faiss_index
        self.openai_model = openai_model

    @staticmethod
    def _top_matches(documents: List[Dict], distances: np.ndarray, top_k: int) -> List[Dict]:
        """Format the top_k documents with the smallest L2 distances (lower is better)."""
        indices = np.argsort(distances)[:top_k]
        return [
            {'id': documents[i]['id'], 'content': documents[i]['content'], 'score': float(distances[i])}
            for i in indices
        ]

    def query(self, prompt: str, documents: List[Dict], embedding_method: str, top_k: int,
              sentence_embeddings: np.ndarray, openai_embeddings: np.ndarray) -> List[Dict]:
        """Query using the specified embedding method.

        Returns a list of {'id', 'content', 'score'} dicts. Scores are L2
        distances (lower is better) for all methods except 'chroma', whose
        backend distance is returned as-is.

        Raises:
            ValueError: If ``embedding_method`` is not one of
                'sentence_transformer', 'openai', 'chroma', 'faiss'.
        """
        if embedding_method == 'sentence_transformer':
            # Brute-force L2 search over the precomputed corpus embeddings
            query_embedding = self.sentence_model.encode([prompt])[0]
            distances = np.linalg.norm(sentence_embeddings - query_embedding, axis=1)
            return self._top_matches(documents, distances, top_k)

        elif embedding_method == 'openai':
            query_embedding = self.openai_client.embeddings.create(
                input=[prompt],
                model=self.openai_model
            ).data[0].embedding
            distances = np.linalg.norm(np.array(openai_embeddings) - query_embedding, axis=1)
            return self._top_matches(documents, distances, top_k)

        elif embedding_method == 'chroma':
            # ChromaDB embeds the prompt itself and performs the search
            results = self.chroma_collection.query(
                query_texts=[prompt],
                n_results=top_k
            )
            # doc_id rather than id: avoid shadowing the builtin id()
            return [
                {'id': doc_id, 'content': text, 'score': dist}
                for doc_id, text, dist in zip(results['ids'][0], results['documents'][0], results['distances'][0])
            ]

        elif embedding_method == 'faiss':
            # FAISS index was built from SentenceTransformer embeddings,
            # so the query must be encoded with the same model.
            query_embedding = self.sentence_model.encode([prompt])[0]
            distances, indices = self.faiss_index.search(np.array([query_embedding]), top_k)
            return [
                {'id': documents[i]['id'], 'content': documents[i]['content'], 'score': float(distances[0][j])}
                for j, i in enumerate(indices[0])
            ]

        else:
            raise ValueError(f"Unsupported embedding method: {embedding_method}")
reranker.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rank_bm25 import BM25Okapi
2
+ from typing import List, Dict
3
+
4
class Reranker:
    """BM25-based reranker that fuses lexical scores with retriever scores."""

    def __init__(self):
        self.tokenized_docs = []  # whitespace-tokenized corpus, index-aligned with doc ids
        self.bm25 = None  # BM25Okapi index; built by prepare()

    def prepare(self, documents: List[Dict]):
        """Build the BM25 index over the full document corpus.

        Must be called before rerank(). Tokenization is naive whitespace
        splitting — no lowercasing or punctuation stripping.
        """
        self.tokenized_docs = [doc['content'].split() for doc in documents]
        self.bm25 = BM25Okapi(self.tokenized_docs)

    def rerank(self, query: str, initial_results: List[Dict], top_k: int) -> List[Dict]:
        """Rerank initial search results by adding BM25 scores.

        Args:
            query: The original query text.
            initial_results: Retriever hits; each needs 'id' (of the form
                "doc_<corpus index>"), 'content' and 'score'.
            top_k: Maximum number of results to return.

        Raises:
            RuntimeError: If prepare() was not called first.

        NOTE(review): retriever scores are L2 *distances* for most backends
        (lower is better) while BM25 scores are relevances (higher is
        better); summing them mixes the two directions — confirm the
        intended fusion before relying on this ordering.
        """
        if self.bm25 is None:
            # Fail with a clear message instead of an opaque AttributeError.
            raise RuntimeError("Reranker.prepare() must be called before rerank()")

        tokenized_query = query.split()
        scores = self.bm25.get_scores(tokenized_query)

        # Combine initial scores with BM25 scores
        reranked = []
        for result in initial_results:
            # Recover the corpus index from the "doc_<index>" id to look
            # up the corresponding BM25 score.
            doc_idx = int(result['id'].split('_')[1])
            combined_score = result['score'] + scores[doc_idx]
            reranked.append({
                'id': result['id'],
                'content': result['content'],
                'score': combined_score
            })

        # Sort by combined score, best (highest) first
        reranked.sort(key=lambda x: x['score'], reverse=True)
        return reranked[:top_k]