arbabarshad commited on
Commit
f7bad94
·
1 Parent(s): f3be97d

starting sep 29 2

Browse files
README.md CHANGED
@@ -43,4 +43,11 @@ This repository encountered several Git LFS issues during setup. Here's a summar
43
 
44
  * Pushing branches with problematic LFS history to a fresh remote can fail. Starting the remote with a clean, history-free branch is a workaround.
45
  * When adding LFS tracking for existing binary files via `.gitattributes`, ensure the commit correctly converts files to LFS pointers. `git add --renormalize .` after updating `.gitattributes` and *before* committing is often necessary.
46
- * Double-check `.gitignore` if expected files or directories are missing after a `git add .`.
 
 
 
 
 
 
 
 
43
 
44
  * Pushing branches with problematic LFS history to a fresh remote can fail. Starting the remote with a clean, history-free branch is a workaround.
45
  * When adding LFS tracking for existing binary files via `.gitattributes`, ensure the commit correctly converts files to LFS pointers. `git add --renormalize .` after updating `.gitattributes` and *before* committing is often necessary.
46
+ * Double-check `.gitignore` if expected files or directories are missing after a `git add .`.
47
+
48
+
49
+ While running in Claude Code:
50
+ source ~/miniconda3/etc/profile.d/conda.sh && conda activate agthinker
51
+
52
+ Run a command like this example: source ~/miniconda3/etc/profile.d/conda.sh && conda activate agllm-env1-updates-1 &&
53
+ python whatever_script.py
app.py CHANGED
@@ -1,6 +1,7 @@
 
1
  import os
2
  # https://stackoverflow.com/questions/76175046/how-to-add-prompt-to-langchain-conversationalretrievalchain-chat-over-docs-with
3
- # again from:
4
  # https://python.langchain.com/docs/integrations/providers/vectara/vectara_chat
5
  from langchain.document_loaders import PyPDFDirectoryLoader
6
  import pandas as pd
 
1
+ # hello world
2
  import os
3
  # https://stackoverflow.com/questions/76175046/how-to-add-prompt-to-langchain-conversationalretrievalchain-chat-over-docs-with
4
+ # again from:
5
  # https://python.langchain.com/docs/integrations/providers/vectara/vectara_chat
6
  from langchain.document_loaders import PyPDFDirectoryLoader
7
  import pandas as pd
retrieval_evaluation.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Retrieval Evaluation Script for AgLLM
3
+ Generates questions from chunks and evaluates retrieval performance with precision@k and nDCG@k metrics
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import random
9
+ import numpy as np
10
+ from typing import List, Dict, Tuple, Optional
11
+ from dataclasses import dataclass
12
+ import pandas as pd
13
+ from tqdm import tqdm
14
+ from langchain.vectorstores import Chroma
15
+ from langchain.embeddings import OpenAIEmbeddings
16
+ from langchain.schema import Document
17
+ import openai
18
+ from dotenv import load_dotenv
19
+ import time
20
+
21
+ load_dotenv()
22
+
23
@dataclass
class EvaluationSample:
    """One evaluation record pairing a chunk with the question generated from it.

    The question is constructed so that the chunk itself contains the answer,
    which makes ``ground_truth_chunk_id`` the single relevant document when
    scoring retrieval.
    """
    chunk_id: str                # identifier of the source chunk
    chunk_content: str           # raw text of the chunk
    metadata: Dict               # chunk metadata (e.g. species/region keys read by the evaluator's filters)
    question: str                # question generated from this chunk
    ground_truth_chunk_id: str   # the chunk expected to contain the answer
31
+
32
class QuestionGenerator:
    """Generates questions from chunks using GPT-4.

    Each question is phrased so that the originating chunk contains its
    answer, giving a known ground-truth label for retrieval evaluation.
    """

    # Maximum number of chunk characters embedded in the prompt.
    MAX_CHUNK_CHARS = 1500

    def __init__(self, api_key: Optional[str] = None):
        """Store the API key, falling back to the OPENAI_API_KEY env var.

        Raises:
            ValueError: if no key is supplied and none is in the environment.
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OpenAI API key not found")

    def generate_question(self, chunk_content: str, metadata: Dict) -> str:
        """Generate a question whose answer is contained in the given chunk.

        Falls back to a templated question if the API call (or even the
        ``openai`` import) fails, so the evaluation pipeline never halts on
        a single chunk.
        """
        # Build a short context line from whatever metadata keys are present.
        context_parts = []
        if 'species' in metadata:
            context_parts.append(f"Species: {metadata['species']}")
        if 'common_name' in metadata:
            context_parts.append(f"Common Name: {metadata['common_name']}")
        if 'region' in metadata:
            context_parts.append(f"Region: {metadata['region']}")
        context = " | ".join(context_parts) if context_parts else ""

        # BUG FIX: the original placed a "# Limit chunk size for prompt"
        # comment INSIDE the f-string template, so that literal text was
        # sent to the model as part of the prompt. Truncate outside the
        # template instead.
        truncated_chunk = chunk_content[:self.MAX_CHUNK_CHARS]

        prompt = f"""Given the following agricultural information chunk, generate ONE specific question that this chunk directly answers.
The question should be natural and the kind a farmer or agricultural expert might ask.
The answer to your question MUST be found in the provided chunk.

Context: {context}

Chunk Content:
{truncated_chunk}

Generate a single, clear question (no explanations, just the question):"""

        try:
            # Imported lazily so this module stays usable (via the fallback)
            # when the openai package is not installed.
            from openai import OpenAI
            client = OpenAI(api_key=self.api_key)
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are an agricultural expert who creates precise questions from agricultural information."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                temperature=0.7
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error generating question: {e}")
            # Fallback question keeps the pipeline moving on API failure.
            species = metadata.get('species', 'this species')
            return f"What IPM information is available for {species}?"
84
+
85
class RetrievalEvaluator:
    """Evaluates retrieval performance against a persisted Chroma store.

    Scores precision@k and nDCG@k for four metadata-filter pipelines,
    treating the chunk a question was generated from as the single relevant
    document.
    """

    def __init__(self, persist_directory: str, embedding_model=None):
        """Open the persisted Chroma collection.

        Args:
            persist_directory: path of the persisted Chroma database.
            embedding_model: embedding function; defaults to OpenAIEmbeddings().
        """
        self.persist_directory = persist_directory
        self.embedding = embedding_model or OpenAIEmbeddings()
        self.vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embedding
        )

    def retrieve_chunks(self, query: str, k: int = 5, filter_dict: Optional[Dict] = None) -> List[Tuple["Document", float]]:
        """Retrieve top-k (document, score) pairs, optionally metadata-filtered."""
        if filter_dict:
            return self.vectordb.similarity_search_with_score(query, k=k, filter=filter_dict)
        return self.vectordb.similarity_search_with_score(query, k=k)

    def calculate_precision_at_k(self, retrieved_ids: List[str], ground_truth_id: str, k: int) -> float:
        """Binary precision@k: 1.0 if the ground-truth id appears in the top-k."""
        return 1.0 if ground_truth_id in retrieved_ids[:k] else 0.0

    def calculate_ndcg_at_k(self, retrieved_ids: List[str], ground_truth_id: str, k: int) -> float:
        """nDCG@k with a single relevant document (gives credit for rank).

        With one relevant item the ideal DCG is 1.0 (hit at rank 1), so the
        score is just the discount at the hit position, or 0.0 on a miss.
        """
        for i, chunk_id in enumerate(retrieved_ids[:k]):
            if chunk_id == ground_truth_id:
                # Rank is i+1, so the log2 discount argument is i+2.
                return 1.0 / np.log2(i + 2)
        return 0.0

    def evaluate_retrieval_pipelines(self, samples: List["EvaluationSample"], k_values: Tuple[int, ...] = (1, 3, 5)) -> Dict:
        """Evaluate the four retrieval pipelines over the given samples.

        FIX: the default for ``k_values`` was a mutable list; it is now a
        tuple (callers may still pass any sequence of ints).

        Returns:
            {pipeline: {metric: {'mean', 'std', 'count'}}} with entries only
            for metrics that accumulated at least one value.
        """
        metric_names = [f'precision@{k}' for k in k_values] + [f'ndcg@{k}' for k in k_values]
        pipelines = ('no_filter', 'species_only', 'region_only', 'species_and_region')
        results = {p: {m: [] for m in metric_names} for p in pipelines}

        # One retrieval at the largest k serves every smaller k (hoisted).
        max_k = max(k_values)

        for sample in tqdm(samples, desc="Evaluating samples"):
            question = sample.question
            ground_truth_id = sample.ground_truth_chunk_id
            metadata = sample.metadata

            # Filter strategies in ChromaDB filter format; None when the
            # required metadata keys are absent for this sample.
            filters = {
                'no_filter': None,
                'species_only': {'species': {'$eq': metadata['species']}} if 'species' in metadata else None,
                'region_only': {'region': {'$eq': metadata['region']}} if 'region' in metadata else None,
                'species_and_region': {
                    '$and': [
                        {'species': {'$eq': metadata['species']}},
                        {'region': {'$eq': metadata['region']}}
                    ]
                } if 'species' in metadata and 'region' in metadata else None
            }

            for filter_name, filter_dict in filters.items():
                # Skip filtered pipelines whose required metadata is missing.
                if filter_name != 'no_filter' and filter_dict is None:
                    continue

                retrieved_results = self.retrieve_chunks(question, k=max_k, filter_dict=filter_dict)
                # Chunk ids live in the 'source' metadata field; scores unused.
                retrieved_ids = [doc.metadata.get('source', '') for doc, _score in retrieved_results]

                for k in k_values:
                    results[filter_name][f'precision@{k}'].append(
                        self.calculate_precision_at_k(retrieved_ids, ground_truth_id, k))
                    results[filter_name][f'ndcg@{k}'].append(
                        self.calculate_ndcg_at_k(retrieved_ids, ground_truth_id, k))

        # Aggregate each metric list to mean/std/count, skipping empty lists
        # (a pipeline may never run if no sample has the needed metadata).
        averaged_results = {}
        for pipeline, metrics in results.items():
            averaged_results[pipeline] = {}
            for metric_name, values in metrics.items():
                if values:
                    averaged_results[pipeline][metric_name] = {
                        'mean': np.mean(values),
                        'std': np.std(values),
                        'count': len(values)
                    }

        return averaged_results
192
+
193
def load_chunks_from_vectordb(persist_directory: str, sample_size: Optional[int] = None) -> List[Dict]:
    """Load chunk records from a persisted Chroma vector database.

    Args:
        persist_directory: path of the persisted Chroma store.
        sample_size: when given, randomly down-sample to at most this many chunks.

    Returns:
        A list of dicts, each with 'id', 'content', and 'metadata' keys.
    """
    embeddings = OpenAIEmbeddings()
    store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings
    )

    # Chroma exposes no direct "fetch everything" call here, so approximate
    # it with one very large similarity search.
    documents = store.similarity_search("", k=10000)  # Get many results

    chunks = [
        {
            'id': doc.metadata.get('source', ''),
            'content': doc.page_content,
            'metadata': doc.metadata
        }
        for doc in documents
    ]

    if sample_size and len(chunks) > sample_size:
        chunks = random.sample(chunks, sample_size)

    return chunks
218
+
219
def main():
    """Run the end-to-end retrieval evaluation pipeline.

    Steps: load chunks from the persisted vector DB, generate one question
    per chunk via the OpenAI API, score four metadata-filter pipelines with
    precision@k / nDCG@k, then print a summary and save detailed results
    to JSON. Requires a populated Chroma store and an OpenAI API key.
    """

    # Configuration
    VECTOR_DB_PATH = 'vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
    SAMPLE_SIZE = 20  # Start with smaller sample for testing
    K_VALUES = [1, 3, 5]
    OUTPUT_FILE = 'retrieval_evaluation_results.json'

    print("Starting Retrieval Evaluation Pipeline")
    print("=" * 50)

    # Step 1: Load chunks from vector database
    print("\n1. Loading chunks from vector database...")
    chunks = load_chunks_from_vectordb(VECTOR_DB_PATH, sample_size=SAMPLE_SIZE)
    print(f" Loaded {len(chunks)} chunks")

    # Step 2: Generate questions for chunks (one OpenAI call per chunk)
    print("\n2. Generating questions from chunks...")
    question_generator = QuestionGenerator()
    samples = []

    for i, chunk in enumerate(tqdm(chunks, desc="Generating questions")):
        try:
            question = question_generator.generate_question(
                chunk['content'],
                chunk['metadata']
            )

            # The source chunk is its own ground truth: the question was
            # generated so that this chunk contains the answer.
            sample = EvaluationSample(
                chunk_id=chunk['id'],
                chunk_content=chunk['content'],
                metadata=chunk['metadata'],
                question=question,
                ground_truth_chunk_id=chunk['id']
            )
            samples.append(sample)

            # Rate limiting for API (pause 1s after every 10 requests —
            # NOTE(review): presumably tuned to the account's rate limits)
            if (i + 1) % 10 == 0:
                time.sleep(1)

        except Exception as e:
            # Best-effort: a single bad chunk should not abort the run.
            print(f" Error processing chunk {i}: {e}")
            continue

    print(f" Generated {len(samples)} question-chunk pairs")

    # Step 3: Evaluate retrieval pipelines
    print("\n3. Evaluating retrieval pipelines...")
    evaluator = RetrievalEvaluator(VECTOR_DB_PATH)
    results = evaluator.evaluate_retrieval_pipelines(samples, k_values=K_VALUES)

    # Step 4: Display and save results
    print("\n4. Evaluation Results:")
    print("=" * 50)

    for pipeline_name, metrics in results.items():
        print(f"\n{pipeline_name.upper()} Pipeline:")
        for metric_name, values in metrics.items():
            # Each metric entry is a {'mean','std','count'} dict.
            if isinstance(values, dict):
                mean = values['mean']
                std = values['std']
                print(f" {metric_name}: {mean:.3f} ± {std:.3f}")

    # Save detailed results (numpy float64 subclasses float, so json accepts it)
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nDetailed results saved to {OUTPUT_FILE}")

    # Generate comparison statement for paper: filtered vs. unfiltered baseline
    print("\n" + "=" * 50)
    print("RESULTS SUMMARY FOR PAPER:")
    print("=" * 50)

    baseline = results.get('no_filter', {})
    species_region = results.get('species_and_region', {})

    if baseline and species_region:
        for k in K_VALUES:
            precision_baseline = baseline.get(f'precision@{k}', {}).get('mean', 0)
            precision_filtered = species_region.get(f'precision@{k}', {}).get('mean', 0)
            ndcg_baseline = baseline.get(f'ndcg@{k}', {}).get('mean', 0)
            ndcg_filtered = species_region.get(f'ndcg@{k}', {}).get('mean', 0)

            # Relative improvement in percent; guarded against divide-by-zero
            # when the baseline metric is 0.
            precision_improvement = ((precision_filtered - precision_baseline) / precision_baseline * 100) if precision_baseline > 0 else 0
            ndcg_improvement = ((ndcg_filtered - ndcg_baseline) / ndcg_baseline * 100) if ndcg_baseline > 0 else 0

            print(f"\nCompared to a region-agnostic baseline, precision@{k} improves from {precision_baseline:.3f} "
                  f"to {precision_filtered:.3f} ({precision_improvement:+.1f}%) and nDCG@{k} from {ndcg_baseline:.3f} "
                  f"to {ndcg_filtered:.3f} ({ndcg_improvement:+.1f}%) when using species and region filters.")

if __name__ == "__main__":
    main()
retrieval_evaluation_results.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "no_filter": {
3
+ "precision@1": {
4
+ "mean": 0.75,
5
+ "std": 0.4330127018922193,
6
+ "count": 20
7
+ },
8
+ "precision@3": {
9
+ "mean": 0.95,
10
+ "std": 0.21794494717703372,
11
+ "count": 20
12
+ },
13
+ "precision@5": {
14
+ "mean": 1.0,
15
+ "std": 0.0,
16
+ "count": 20
17
+ },
18
+ "ndcg@1": {
19
+ "mean": 0.75,
20
+ "std": 0.4330127018922193,
21
+ "count": 20
22
+ },
23
+ "ndcg@3": {
24
+ "mean": 0.8696394630357187,
25
+ "std": 0.2567840676954238,
26
+ "count": 20
27
+ },
28
+ "ndcg@5": {
29
+ "mean": 0.8911732909393884,
30
+ "std": 0.19311947983364772,
31
+ "count": 20
32
+ }
33
+ },
34
+ "species_only": {
35
+ "precision@1": {
36
+ "mean": 1.0,
37
+ "std": 0.0,
38
+ "count": 5
39
+ },
40
+ "precision@3": {
41
+ "mean": 1.0,
42
+ "std": 0.0,
43
+ "count": 5
44
+ },
45
+ "precision@5": {
46
+ "mean": 1.0,
47
+ "std": 0.0,
48
+ "count": 5
49
+ },
50
+ "ndcg@1": {
51
+ "mean": 1.0,
52
+ "std": 0.0,
53
+ "count": 5
54
+ },
55
+ "ndcg@3": {
56
+ "mean": 1.0,
57
+ "std": 0.0,
58
+ "count": 5
59
+ },
60
+ "ndcg@5": {
61
+ "mean": 1.0,
62
+ "std": 0.0,
63
+ "count": 5
64
+ }
65
+ },
66
+ "region_only": {
67
+ "precision@1": {
68
+ "mean": 0.75,
69
+ "std": 0.4330127018922193,
70
+ "count": 20
71
+ },
72
+ "precision@3": {
73
+ "mean": 0.95,
74
+ "std": 0.21794494717703372,
75
+ "count": 20
76
+ },
77
+ "precision@5": {
78
+ "mean": 1.0,
79
+ "std": 0.0,
80
+ "count": 20
81
+ },
82
+ "ndcg@1": {
83
+ "mean": 0.75,
84
+ "std": 0.4330127018922193,
85
+ "count": 20
86
+ },
87
+ "ndcg@3": {
88
+ "mean": 0.8696394630357187,
89
+ "std": 0.2567840676954238,
90
+ "count": 20
91
+ },
92
+ "ndcg@5": {
93
+ "mean": 0.8911732909393884,
94
+ "std": 0.19311947983364772,
95
+ "count": 20
96
+ }
97
+ },
98
+ "species_and_region": {
99
+ "precision@1": {
100
+ "mean": 1.0,
101
+ "std": 0.0,
102
+ "count": 5
103
+ },
104
+ "precision@3": {
105
+ "mean": 1.0,
106
+ "std": 0.0,
107
+ "count": 5
108
+ },
109
+ "precision@5": {
110
+ "mean": 1.0,
111
+ "std": 0.0,
112
+ "count": 5
113
+ },
114
+ "ndcg@1": {
115
+ "mean": 1.0,
116
+ "std": 0.0,
117
+ "count": 5
118
+ },
119
+ "ndcg@3": {
120
+ "mean": 1.0,
121
+ "std": 0.0,
122
+ "count": 5
123
+ },
124
+ "ndcg@5": {
125
+ "mean": 1.0,
126
+ "std": 0.0,
127
+ "count": 5
128
+ }
129
+ }
130
+ }
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:afd345c37b027282fda52059468b08f145bafeb1c22e0b5a5678258aadc1f22e
3
  size 9072640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0323fbf65a7d0d8cfbad75ed514829fc5d979a0d89603c61f511ed46c87dd69e
3
  size 9072640
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/e82d58e5-16f1-41a6-9289-211464329861/length.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7e2dcff542de95352682dc186432e98f0188084896773f1973276b0577d5305
3
  size 40000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0eca7ce2600dfc137188f7b969056d2155f188796a248ab9b3b78f60431df7e
3
  size 40000