arbabarshad commited on
Commit
f7bad94
·
1 Parent(s): f3be97d

starting sep 29 2

Browse files
README.md CHANGED
@@ -43,4 +43,11 @@ This repository encountered several Git LFS issues during setup. Here's a summar
43
 
44
  * Pushing branches with problematic LFS history to a fresh remote can fail. Starting the remote with a clean, history-free branch is a workaround.
45
  * When adding LFS tracking for existing binary files via `.gitattributes`, ensure the commit correctly converts files to LFS pointers. `git add --renormalize .` after updating `.gitattributes` and *before* committing is often necessary.
46
- * Double-check `.gitignore` if expected files or directories are missing after a `git add .`.
 
 
 
 
 
 
 
 
43
 
44
  * Pushing branches with problematic LFS history to a fresh remote can fail. Starting the remote with a clean, history-free branch is a workaround.
45
  * When adding LFS tracking for existing binary files via `.gitattributes`, ensure the commit correctly converts files to LFS pointers. `git add --renormalize .` after updating `.gitattributes` and *before* committing is often necessary.
46
+ * Double-check `.gitignore` if expected files or directories are missing after a `git add .`.
47
+
48
+
49
+ While running in Claude Code:
50
+ source ~/miniconda3/etc/profile.d/conda.sh && conda activate agthinker
51
+
52
+ Run a command like this example: source ~/miniconda3/etc/profile.d/conda.sh && conda activate agllm-env1-updates-1 &&
53
+ python whatever_script.py
app.py CHANGED
@@ -1,6 +1,7 @@
 
1
  import os
2
  # https://stackoverflow.com/questions/76175046/how-to-add-prompt-to-langchain-conversationalretrievalchain-chat-over-docs-with
3
- # again from:
4
  # https://python.langchain.com/docs/integrations/providers/vectara/vectara_chat
5
  from langchain.document_loaders import PyPDFDirectoryLoader
6
  import pandas as pd
 
1
+ # hello world
2
  import os
3
  # https://stackoverflow.com/questions/76175046/how-to-add-prompt-to-langchain-conversationalretrievalchain-chat-over-docs-with
4
+ # again from:
5
  # https://python.langchain.com/docs/integrations/providers/vectara/vectara_chat
6
  from langchain.document_loaders import PyPDFDirectoryLoader
7
  import pandas as pd
retrieval_evaluation.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Retrieval Evaluation Script for AgLLM
3
+ Generates questions from chunks and evaluates retrieval performance with precision@k and nDCG@k metrics
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import random
9
+ import numpy as np
10
+ from typing import List, Dict, Tuple, Optional
11
+ from dataclasses import dataclass
12
+ import pandas as pd
13
+ from tqdm import tqdm
14
+ from langchain.vectorstores import Chroma
15
+ from langchain.embeddings import OpenAIEmbeddings
16
+ from langchain.schema import Document
17
+ import openai
18
+ from dotenv import load_dotenv
19
+ import time
20
+
21
+ load_dotenv()
22
+
23
@dataclass
class EvaluationSample:
    """One evaluation record pairing a chunk with the question generated from it.

    The question is constructed so that the chunk itself contains the answer,
    which makes ``ground_truth_chunk_id`` the single relevant document when
    scoring retrieval.
    """
    chunk_id: str                # identifier of the source chunk
    chunk_content: str           # raw text of the chunk
    metadata: Dict               # chunk metadata (e.g. species/region keys read by the evaluator's filters)
    question: str                # question generated from this chunk
    ground_truth_chunk_id: str   # the chunk expected to contain the answer
31
+
32
class QuestionGenerator:
    """Generates questions from chunks using GPT-4.

    Each question is phrased so that the originating chunk contains its
    answer, giving a known ground-truth label for retrieval evaluation.
    """

    # Maximum number of chunk characters embedded in the prompt.
    MAX_CHUNK_CHARS = 1500

    def __init__(self, api_key: Optional[str] = None):
        """Store the API key, falling back to the OPENAI_API_KEY env var.

        Raises:
            ValueError: if no key is supplied and none is in the environment.
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OpenAI API key not found")

    def generate_question(self, chunk_content: str, metadata: Dict) -> str:
        """Generate a question whose answer is contained in the given chunk.

        Falls back to a templated question if the API call (or even the
        ``openai`` import) fails, so the evaluation pipeline never halts on
        a single chunk.
        """
        # Build a short context line from whatever metadata keys are present.
        context_parts = []
        if 'species' in metadata:
            context_parts.append(f"Species: {metadata['species']}")
        if 'common_name' in metadata:
            context_parts.append(f"Common Name: {metadata['common_name']}")
        if 'region' in metadata:
            context_parts.append(f"Region: {metadata['region']}")
        context = " | ".join(context_parts) if context_parts else ""

        # BUG FIX: the original placed a "# Limit chunk size for prompt"
        # comment INSIDE the f-string template, so that literal text was
        # sent to the model as part of the prompt. Truncate outside the
        # template instead.
        truncated_chunk = chunk_content[:self.MAX_CHUNK_CHARS]

        prompt = f"""Given the following agricultural information chunk, generate ONE specific question that this chunk directly answers.
The question should be natural and the kind a farmer or agricultural expert might ask.
The answer to your question MUST be found in the provided chunk.

Context: {context}

Chunk Content:
{truncated_chunk}

Generate a single, clear question (no explanations, just the question):"""

        try:
            # Imported lazily so this module stays usable (via the fallback)
            # when the openai package is not installed.
            from openai import OpenAI
            client = OpenAI(api_key=self.api_key)
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are an agricultural expert who creates precise questions from agricultural information."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                temperature=0.7
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error generating question: {e}")
            # Fallback question keeps the pipeline moving on API failure.
            species = metadata.get('species', 'this species')
            return f"What IPM information is available for {species}?"
84
+
85
class RetrievalEvaluator:
    """Evaluates retrieval performance against a persisted Chroma store.

    Scores precision@k and nDCG@k for four metadata-filter pipelines,
    treating the chunk a question was generated from as the single relevant
    document.
    """

    def __init__(self, persist_directory: str, embedding_model=None):
        """Open the persisted Chroma collection.

        Args:
            persist_directory: path of the persisted Chroma database.
            embedding_model: embedding function; defaults to OpenAIEmbeddings().
        """
        self.persist_directory = persist_directory
        self.embedding = embedding_model or OpenAIEmbeddings()
        self.vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embedding
        )

    def retrieve_chunks(self, query: str, k: int = 5, filter_dict: Optional[Dict] = None) -> List[Tuple["Document", float]]:
        """Retrieve top-k (document, score) pairs, optionally metadata-filtered."""
        if filter_dict:
            return self.vectordb.similarity_search_with_score(query, k=k, filter=filter_dict)
        return self.vectordb.similarity_search_with_score(query, k=k)

    def calculate_precision_at_k(self, retrieved_ids: List[str], ground_truth_id: str, k: int) -> float:
        """Binary precision@k: 1.0 if the ground-truth id appears in the top-k."""
        return 1.0 if ground_truth_id in retrieved_ids[:k] else 0.0

    def calculate_ndcg_at_k(self, retrieved_ids: List[str], ground_truth_id: str, k: int) -> float:
        """nDCG@k with a single relevant document (gives credit for rank).

        With one relevant item the ideal DCG is 1.0 (hit at rank 1), so the
        score is just the discount at the hit position, or 0.0 on a miss.
        """
        for i, chunk_id in enumerate(retrieved_ids[:k]):
            if chunk_id == ground_truth_id:
                # Rank is i+1, so the log2 discount argument is i+2.
                return 1.0 / np.log2(i + 2)
        return 0.0

    def evaluate_retrieval_pipelines(self, samples: List["EvaluationSample"], k_values: Tuple[int, ...] = (1, 3, 5)) -> Dict:
        """Evaluate the four retrieval pipelines over the given samples.

        FIX: the default for ``k_values`` was a mutable list; it is now a
        tuple (callers may still pass any sequence of ints).

        Returns:
            {pipeline: {metric: {'mean', 'std', 'count'}}} with entries only
            for metrics that accumulated at least one value.
        """
        metric_names = [f'precision@{k}' for k in k_values] + [f'ndcg@{k}' for k in k_values]
        pipelines = ('no_filter', 'species_only', 'region_only', 'species_and_region')
        results = {p: {m: [] for m in metric_names} for p in pipelines}

        # One retrieval at the largest k serves every smaller k (hoisted).
        max_k = max(k_values)

        for sample in tqdm(samples, desc="Evaluating samples"):
            question = sample.question
            ground_truth_id = sample.ground_truth_chunk_id
            metadata = sample.metadata

            # Filter strategies in ChromaDB filter format; None when the
            # required metadata keys are absent for this sample.
            filters = {
                'no_filter': None,
                'species_only': {'species': {'$eq': metadata['species']}} if 'species' in metadata else None,
                'region_only': {'region': {'$eq': metadata['region']}} if 'region' in metadata else None,
                'species_and_region': {
                    '$and': [
                        {'species': {'$eq': metadata['species']}},
                        {'region': {'$eq': metadata['region']}}
                    ]
                } if 'species' in metadata and 'region' in metadata else None
            }

            for filter_name, filter_dict in filters.items():
                # Skip filtered pipelines whose required metadata is missing.
                if filter_name != 'no_filter' and filter_dict is None:
                    continue

                retrieved_results = self.retrieve_chunks(question, k=max_k, filter_dict=filter_dict)
                # Chunk ids live in the 'source' metadata field; scores unused.
                retrieved_ids = [doc.metadata.get('source', '') for doc, _score in retrieved_results]

                for k in k_values:
                    results[filter_name][f'precision@{k}'].append(
                        self.calculate_precision_at_k(retrieved_ids, ground_truth_id, k))
                    results[filter_name][f'ndcg@{k}'].append(
                        self.calculate_ndcg_at_k(retrieved_ids, ground_truth_id, k))

        # Aggregate each metric list to mean/std/count, skipping empty lists
        # (a pipeline may never run if no sample has the needed metadata).
        averaged_results = {}
        for pipeline, metrics in results.items():
            averaged_results[pipeline] = {}
            for metric_name, values in metrics.items():
                if values:
                    averaged_results[pipeline][metric_name] = {
                        'mean': np.mean(values),
                        'std': np.std(values),
                        'count': len(values)
                    }

        return averaged_results
192
+
193
def load_chunks_from_vectordb(persist_directory: str, sample_size: Optional[int] = None) -> List[Dict]:
    """Load chunk records from a persisted Chroma vector database.

    Args:
        persist_directory: path of the persisted Chroma store.
        sample_size: when given, randomly down-sample to at most this many chunks.

    Returns:
        A list of dicts, each with 'id', 'content', and 'metadata' keys.
    """
    embeddings = OpenAIEmbeddings()
    store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings
    )

    # Chroma exposes no direct "fetch everything" call here, so approximate
    # it with one very large similarity search.
    documents = store.similarity_search("", k=10000)  # Get many results

    chunks = [
        {
            'id': doc.metadata.get('source', ''),
            'content': doc.page_content,
            'metadata': doc.metadata
        }
        for doc in documents
    ]

    if sample_size and len(chunks) > sample_size:
        chunks = random.sample(chunks, sample_size)

    return chunks
218
+
219
def main():
    """Run the end-to-end retrieval evaluation pipeline.

    Steps: load chunks from the persisted vector DB, generate one question
    per chunk via the OpenAI API, score four metadata-filter pipelines with
    precision@k / nDCG@k, then print a summary and save detailed results
    to JSON. Requires a populated Chroma store and an OpenAI API key.
    """

    # Configuration
    VECTOR_DB_PATH = 'vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
    SAMPLE_SIZE = 20  # Start with smaller sample for testing
    K_VALUES = [1, 3, 5]
    OUTPUT_FILE = 'retrieval_evaluation_results.json'

    print("Starting Retrieval Evaluation Pipeline")
    print("=" * 50)

    # Step 1: Load chunks from vector database
    print("\n1. Loading chunks from vector database...")
    chunks = load_chunks_from_vectordb(VECTOR_DB_PATH, sample_size=SAMPLE_SIZE)
    print(f" Loaded {len(chunks)} chunks")

    # Step 2: Generate questions for chunks (one OpenAI call per chunk)
    print("\n2. Generating questions from chunks...")
    question_generator = QuestionGenerator()
    samples = []

    for i, chunk in enumerate(tqdm(chunks, desc="Generating questions")):
        try:
            question = question_generator.generate_question(
                chunk['content'],
                chunk['metadata']
            )

            # The source chunk is its own ground truth: the question was
            # generated so that this chunk contains the answer.
            sample = EvaluationSample(
                chunk_id=chunk['id'],
                chunk_content=chunk['content'],
                metadata=chunk['metadata'],
                question=question,
                ground_truth_chunk_id=chunk['id']
            )
            samples.append(sample)

            # Rate limiting for API (pause 1s after every 10 requests —
            # NOTE(review): presumably tuned to the account's rate limits)
            if (i + 1) % 10 == 0:
                time.sleep(1)

        except Exception as e:
            # Best-effort: a single bad chunk should not abort the run.
            print(f" Error processing chunk {i}: {e}")
            continue

    print(f" Generated {len(samples)} question-chunk pairs")

    # Step 3: Evaluate retrieval pipelines
    print("\n3. Evaluating retrieval pipelines...")
    evaluator = RetrievalEvaluator(VECTOR_DB_PATH)
    results = evaluator.evaluate_retrieval_pipelines(samples, k_values=K_VALUES)

    # Step 4: Display and save results
    print("\n4. Evaluation Results:")
    print("=" * 50)

    for pipeline_name, metrics in results.items():
        print(f"\n{pipeline_name.upper()} Pipeline:")
        for metric_name, values in metrics.items():
            # Each metric entry is a {'mean','std','count'} dict.
            if isinstance(values, dict):
                mean = values['mean']
                std = values['std']
                print(f" {metric_name}: {mean:.3f} ± {std:.3f}")

    # Save detailed results (numpy float64 subclasses float, so json accepts it)
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nDetailed results saved to {OUTPUT_FILE}")

    # Generate comparison statement for paper: filtered vs. unfiltered baseline
    print("\n" + "=" * 50)
    print("RESULTS SUMMARY FOR PAPER:")
    print("=" * 50)

    baseline = results.get('no_filter', {})
    species_region = results.get('species_and_region', {})

    if baseline and species_region:
        for k in K_VALUES:
            precision_baseline = baseline.get(f'precision@{k}', {}).get('mean', 0)
            precision_filtered = species_region.get(f'precision@{k}', {}).get('mean', 0)
            ndcg_baseline = baseline.get(f'ndcg@{k}', {}).get('mean', 0)
            ndcg_filtered = species_region.get(f'ndcg@{k}', {}).get('mean', 0)

            # Relative improvement in percent; guarded against divide-by-zero
            # when the baseline metric is 0.
            precision_improvement = ((precision_filtered - precision_baseline) / precision_baseline * 100) if precision_baseline > 0 else 0
            ndcg_improvement = ((ndcg_filtered - ndcg_baseline) / ndcg_baseline * 100) if ndcg_baseline > 0 else 0

            print(f"\nCompared to a region-agnostic baseline, precision@{k} improves from {precision_baseline:.3f} "
                  f"to {precision_filtered:.3f} ({precision_improvement:+.1f}%) and nDCG@{k} from {ndcg_baseline:.3f} "
                  f"to {ndcg_filtered:.3f} ({ndcg_improvement:+.1f}%) when using species and region filters.")

if __name__ == "__main__":
    main()
retrieval_evaluation_results.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "no_filter": {
3
+ "precision@1": {
4
+ "mean": 0.75,
5
+ "std": 0.4330127018922193,
6
+ "count": 20
7
+ },
8
+ "precision@3": {
9
+ "mean": 0.95,
10
+ "std": 0.21794494717703372,
11
+ "count": 20
12
+ },
13
+ "precision@5": {
14
+ "mean": 1.0,
15
+ "std": 0.0,
16
+ "count": 20
17
+ },
18
+ "ndcg@1": {
19
+ "mean": 0.75,
20
+ "std": 0.4330127018922193,
21
+ "count": 20
22
+ },
23
+ "ndcg@3": {
24
+ "mean": 0.8696394630357187,
25
+ "std": 0.2567840676954238,
26
+ "count": 20
27
+ },
28
+ "ndcg@5": {
29
+ "mean": 0.8911732909393884,
30
+ "std": 0.19311947983364772,
31
+ "count": 20
32
+ }
33
+ },
34
+ "species_only": {
35
+ "precision@1": {
36
+ "mean": 1.0,
37
+ "std": 0.0,
38
+ "count": 5
39
+ },
40
+ "precision@3": {
41
+ "mean": 1.0,
42
+ "std": 0.0,
43
+ "count": 5
44
+ },
45
+ "precision@5": {
46
+ "mean": 1.0,
47
+ "std": 0.0,
48
+ "count": 5
49
+ },
50
+ "ndcg@1": {
51
+ "mean": 1.0,
52
+ "std": 0.0,
53
+ "count": 5
54
+ },
55
+ "ndcg@3": {
56
+ "mean": 1.0,
57
+ "std": 0.0,
58
+ "count": 5
59
+ },
60
+ "ndcg@5": {
61
+ "mean": 1.0,
62
+ "std": 0.0,
63
+ "count": 5
64
+ }
65
+ },
66
+ "region_only": {
67
+ "precision@1": {
68
+ "mean": 0.75,
69
+ "std": 0.4330127018922193,
70
+ "count": 20
71
+ },
72
+ "precision@3": {
73
+ "mean": 0.95,
74
+ "std": 0.21794494717703372,
75
+ "count": 20
76
+ },
77
+ "precision@5": {
78
+ "mean": 1.0,
79
+ "std": 0.0,
80
+ "count": 20
81
+ },
82
+ "ndcg@1": {
83
+ "mean": 0.75,
84
+ "std": 0.4330127018922193,
85
+ "count": 20
86
+ },
87
+ "ndcg@3": {
88
+ "mean": 0.8696394630357187,
89
+ "std": 0.2567840676954238,
90
+ "count": 20
91
+ },
92
+ "ndcg@5": {
93
+ "mean": 0.8911732909393884,
94
+ "std": 0.19311947983364772,
95
+ "count": 20
96
+ }
97
+ },
98
+ "species_and_region": {
99
+ "precision@1": {
100
+ "mean": 1.0,
101
+ "std": 0.0,
102
+ "count": 5
103
+ },
104
+ "precision@3": {
105
+ "mean": 1.0,
106
+ "std": 0.0,
107
+ "count": 5
108
+ },
109
+ "precision@5": {
110
+ "mean": 1.0,
111
+ "std": 0.0,
112
+ "count": 5
113
+ },
114
+ "ndcg@1": {
115
+ "mean": 1.0,
116
+ "std": 0.0,
117
+ "count": 5
118
+ },
119
+ "ndcg@3": {
120
+ "mean": 1.0,
121
+ "std": 0.0,
122
+ "count": 5
123
+ },
124
+ "ndcg@5": {
125
+ "mean": 1.0,
126
+ "std": 0.0,
127
+ "count": 5
128
+ }
129
+ }
130
+ }
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afd345c37b027282fda52059468b08f145bafeb1c22e0b5a5678258aadc1f22e
3
  size 9072640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0323fbf65a7d0d8cfbad75ed514829fc5d979a0d89603c61f511ed46c87dd69e
3
  size 9072640
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/e82d58e5-16f1-41a6-9289-211464329861/length.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7e2dcff542de95352682dc186432e98f0188084896773f1973276b0577d5305
3
  size 40000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0eca7ce2600dfc137188f7b969056d2155f188796a248ab9b3b78f60431df7e
3
  size 40000