lsempe commited on
Commit
9c062cd
Β·
0 Parent(s):

Clean repo, remove binary history

Browse files
.gitattributes ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chunk_metadata.csv filter=lfs diff=lfs merge=lfs -text
37
+ research_chunks.faiss filter=lfs diff=lfs merge=lfs -text
38
+ *.csv filter=lfs diff=lfs merge=lfs -text
39
+ *.faiss filter=lfs diff=lfs merge=lfs -text
40
+ *.pdf filter=lfs diff=lfs merge=lfs -text
41
+ 3ie[[:space:]]colours[[:space:]]proof_edited[[:space:]]03-02-2022.pdf filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Research Q&A System
3
+ emoji: πŸ”¬
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.42.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # πŸ”¬ Advanced Research Q&A System
14
+
15
+ An intelligent research synthesis system that provides AI-powered answers across thousands of academic documents with rich metadata analysis.
16
+
17
+ ## ✨ Features
18
+
19
+ ### πŸ€– AI-Powered Research Synthesis
20
+ - **Semantic Search**: Uses Google's Gemini embeddings to find semantically relevant studies
21
+ - **Multi-Study Synthesis**: Combines findings across multiple research papers
22
+ - **Academic Citations**: Properly formatted references with rich context
23
+ - **Query Intelligence**: Understands methodology, findings, challenges, and recommendation queries
24
+
25
+ ### πŸ“Š Rich Metadata Analysis
26
+ - **35+ Metadata Fields**: Comprehensive research characteristics
27
+ - **Quality Metrics**: Rigor scores, validation status, methodological sophistication
28
+ - **Geographic Coverage**: Global research with interactive maps
29
+ - **Sector Analysis**: World Bank sectors and subsectors
30
+
31
+ ### 🌍 Interactive Visualizations
32
+ - **Global Research Map**: See study distribution worldwide
33
+ - **Methodology Dashboard**: Analyze research designs, sample sizes, data collection methods
34
+ - **Sector Distribution**: Understand research focus areas
35
+ - **Advanced Filtering**: Filter by country, sector, year, sample size, RCT status
36
+
37
+ ## πŸš€ How to Use
38
+
39
+ ### 1. **Ask Questions**
40
+ Enter research questions like:
41
+ - "What methods were used in agricultural research in Yemen?"
42
+ - "How do cash transfer programs impact poverty reduction?"
43
+ - "What are the main challenges in education programs in fragile states?"
44
+
45
+ ### 2. **Explore Data**
46
+ - View global research distribution on interactive maps
47
+ - Analyze methodology patterns and quality metrics
48
+ - Filter studies by multiple criteria
49
+
50
+ ### 3. **Get Synthesized Answers**
51
+ Receive comprehensive answers that:
52
+ - Synthesize findings across multiple studies
53
+ - Include proper academic citations
54
+ - Highlight methodological approaches
55
+ - Note sample sizes and study quality
56
+
57
+ ## πŸ“ˆ Data Coverage
58
+
59
+ - **Geographic**: Studies from fragile and conflict-affected states worldwide
60
+ - **Sectors**: Agriculture, education, health, governance, economics, and more
61
+ - **Methods**: RCTs, observational studies, mixed methods, qualitative research
62
+ - **Quality**: Rigor-scored studies with validation information
63
+
64
+ ## πŸ› οΈ Technology Stack
65
+
66
+ - **AI**: Google Gemini for embeddings and text generation
67
+ - **Search**: FAISS vector database for semantic search
68
+ - **UI**: Gradio for interactive web interface
69
+ - **Visualization**: Plotly and Folium for rich charts and maps
70
+ - **Data**: Pandas for metadata analysis
71
+
72
+ ## πŸ“Š Example Output
73
+
74
+ **Query**: "What methods were used in agricultural research in Yemen?"
75
+
76
+ **Answer**: "Across the studies in agricultural development in Yemen, we find three primary methodological approaches with varying rigor scores. Two randomized controlled trials with sample sizes of 1,200 and 800 households employed structured survey instruments and experimental protocols [1, 3]. Community-based participatory research was extensively used in irrigation studies, with rigor scores above 7.5 and validation through multiple data sources [2, 4]..."
77
+
78
+ **References**:
79
+ - [1] Smith, J., Ahmed, M. (2023). Participatory Water Management in Rural Communities. Countries: Yemen | Sector: Agriculture (Irrigation) | Design: RCT (n=1,200) | Quality: RCT, Validated, Rigor: 8.2
80
+
81
+ ## πŸ”§ Setup for Development
82
+
83
+ 1. Clone the repository
84
+ 2. Install requirements: `pip install -r requirements.txt`
85
+ 3. Add your Google API key as environment variable: `GOOGLE_API_KEY`
86
+ 4. Place your FAISS index and metadata files in the root directory
87
+ 5. Run: `python app.py`
88
+
89
+ ## πŸ“ License
90
+
91
+ Apache 2.0 License
92
+
93
+ ## 🀝 Contributing
94
+
95
+ Contributions welcome! This system can be extended to other research domains and document collections.
96
+
97
+ ---
__pycache__/synthesis_qa_backend.cpython-312.pyc ADDED
Binary file (15.2 kB). View file
 
__pycache__/visualisations.cpython-312.pyc ADDED
Binary file (8.8 kB). View file
 
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =============================================================================
# app.py (main entry point)
# =============================================================================
import logging
import os

# The file handler below needs its target directory to exist up front.
os.makedirs("logs", exist_ok=True)

# Root logging config: everything from DEBUG upward goes to both an
# append-mode file and the stream handler (the stream is what shows up in
# the Spaces Logs tab). Drop the level to INFO if DEBUG is too noisy.
_log_handlers = [
    logging.FileHandler("logs/app.log", mode="a"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=_log_handlers,
)

logger = logging.getLogger(__name__)

# Startup breadcrumbs so a fresh deployment proves logging works at all.
logger.info("App started")
logger.debug("This is debug info")
25
+
26
+
27
+
28
+ import gradio as gr
29
+ from data_handler import DataHandler
30
+ from gradio_callbacks import GradioCallbacks
31
+ from gradio_components import (
32
+ create_header, create_qa_tab, create_overview_tab,
33
+ create_methodology_tab, create_pivot_tab, create_filters_tab, create_about_tab
34
+ )
35
+
36
def main():
    """Construct and return the Gradio Blocks app (launching is done by the caller)."""
    # One shared data layer for every tab: the handler owns the corpus,
    # the callbacks object adapts it to Gradio events.
    handler = DataHandler()
    callbacks = GradioCallbacks(handler)
    shared = handler.get_data()

    page_css = """
        .gradio-container {
            max-width: 1200px !important;
        }
        .main-header {
            text-align: center;
            color: white;
            padding: 2rem;
            border-radius: 10px;
            margin-bottom: 2rem;
        }
        """

    with gr.Blocks(
        theme=gr.themes.Monochrome(),
        title="FCAS Research Methods Evidence Mapping",
        css=page_css,
    ) as app:
        # Page banner, then one builder function per tab.
        create_header()

        with gr.Tabs():
            create_qa_tab(callbacks)
            create_overview_tab(callbacks)
            create_methodology_tab(callbacks)
            create_pivot_tab(callbacks)
            create_filters_tab(callbacks, shared['countries_list'], shared['sectors_list'])
            create_about_tab()

    return app
72
+
73
if __name__ == "__main__":
    # Build the UI and serve it; 0.0.0.0:7860 is the standard binding for
    # Hugging Face Spaces containers.
    app = main()
    app.launch(
        share=True,             # also create a public gradio.live link
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True         # surface tracebacks in the browser UI
    )
app_debug.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Temporary app.py for debugging Hugging Face Space issues
4
+ Replace your current app.py with this file temporarily
5
+ """
6
+
7
+ import os
8
+ import pandas as pd
9
+ import faiss
10
+ import google.generativeai as genai
11
+ import numpy as np
12
+ import gradio as gr
13
+
14
def run_all_checks():
    """Run all diagnostic checks and return results.

    Walks the pipeline in dependency order -- API key, Gemini config, data
    files, FAISS index, metadata CSV, embedding API, search, Python
    environment -- and stops at the first failing stage. Every line is both
    collected and printed, and the full report is returned as one
    newline-joined string for display in the Gradio textbox.
    """
    results = []

    def add_result(text):
        # Collect for the UI and echo to stdout (Spaces log) at the same time.
        results.append(text)
        print(text)

    add_result("🚀 DEBUGGING HUGGING FACE SPACE")
    add_result("=" * 50)

    # Environment Check
    add_result("\n🔍 ENVIRONMENT CHECK")
    add_result("=" * 30)

    # The key has been configured under several names historically; accept any.
    api_keys = {
        "GOOGLE_API_KEY": os.environ.get("GOOGLE_API_KEY"),
        "gemini_api": os.environ.get("gemini_api"),
        "GEMINI_API_KEY": os.environ.get("GEMINI_API_KEY"),
    }

    found_key = None
    for key_name, key_value in api_keys.items():
        if key_value:
            add_result(f"✅ {key_name}: {key_value[:10]}...")
            found_key = key_value
        else:
            add_result(f"❌ {key_name}: Not found")

    if not found_key:
        add_result("❌ No API key found in any expected environment variable")
        return "\n".join(results)

    # Test Gemini API
    try:
        genai.configure(api_key=found_key)
        add_result("✅ Gemini API configured successfully")
    except Exception as e:
        add_result(f"❌ Gemini API configuration failed: {e}")
        return "\n".join(results)

    # File Check
    add_result("\n📁 FILE CHECK")
    add_result("=" * 30)

    add_result(f"Current directory: {os.getcwd()}")
    add_result(f"Directory contents: {os.listdir('.')}")

    files_to_check = [
        "research_chunks.faiss",
        "chunk_metadata.csv",
        "requirements.txt"
    ]

    all_files_exist = True
    for file_path in files_to_check:
        if os.path.exists(file_path):
            size = os.path.getsize(file_path)
            add_result(f"✅ {file_path}: {size:,} bytes")
        else:
            add_result(f"❌ {file_path}: NOT FOUND")
            # requirements.txt is nice-to-have; only the data files are fatal.
            if file_path in ["research_chunks.faiss", "chunk_metadata.csv"]:
                all_files_exist = False

    if not all_files_exist:
        add_result("\n❌ CRITICAL: Missing required data files!")
        add_result("You need to upload:")
        add_result("- research_chunks.faiss (FAISS vector index)")
        add_result("- chunk_metadata.csv (document metadata)")
        return "\n".join(results)

    # FAISS Index Check
    add_result("\n🔍 FAISS INDEX CHECK")
    add_result("=" * 30)

    try:
        index = faiss.read_index("research_chunks.faiss")
        add_result(f"✅ FAISS index loaded: {index.ntotal:,} vectors")
        add_result(f"✅ Index dimension: {index.d}")
        add_result(f"✅ Index type: {type(index).__name__}")
    except Exception as e:
        add_result(f"❌ FAISS index loading failed: {e}")
        return "\n".join(results)

    # Metadata Check
    add_result("\n📊 METADATA CHECK")
    add_result("=" * 30)

    try:
        metadata = pd.read_csv("chunk_metadata.csv")
        add_result(f"✅ Metadata loaded: {len(metadata):,} rows")
        add_result(f"✅ Columns ({len(metadata.columns)}): {list(metadata.columns)[:5]}...")
        add_result(f"✅ Unique records: {metadata['record_id'].nunique():,}")

        # Check for required columns
        required_cols = ['record_id', 'text', 'title']
        missing_cols = [col for col in required_cols if col not in metadata.columns]
        if missing_cols:
            add_result(f"⚠️ Missing required columns: {missing_cols}")
        else:
            add_result("✅ All required columns present")

        # Show sample data
        add_result("\n📝 Sample data:")
        for i, row in metadata.head(2).iterrows():
            add_result(f"Row {i}: {row.get('title', 'No title')}")
            add_result(f"  Text preview: {str(row.get('text', 'No text'))[:100]}...")

    except Exception as e:
        add_result(f"❌ Metadata loading failed: {e}")
        return "\n".join(results)

    # Embedding API Test
    add_result("\n🧠 EMBEDDING API TEST")
    add_result("=" * 30)

    try:
        test_query = "agricultural research methods"
        add_result(f"Testing with query: '{test_query}'")

        embed_result = genai.embed_content(
            model="models/embedding-001",
            content=test_query,
            task_type="retrieval_query"
        )

        embedding = np.array([embed_result['embedding']], dtype="float32")
        add_result(f"✅ Embedding created: shape {embedding.shape}")
        add_result(f"✅ First 5 values: {embedding[0][:5]}")

    except Exception as e:
        add_result(f"❌ Embedding API test failed: {e}")
        return "\n".join(results)

    # Full Search Test
    add_result("\n🔍 FULL SEARCH TEST")
    add_result("=" * 30)

    try:
        distances, indices = index.search(embedding, k=5)
        add_result("✅ Search completed")
        add_result(f"✅ Indices: {indices[0]}")
        add_result(f"✅ Distances: {distances[0]}")

        # FAISS pads missing hits with -1; also guard against stale indices
        # that point past the metadata table.
        valid_indices = [idx for idx in indices[0] if idx != -1 and idx < len(metadata)]
        add_result(f"✅ Valid results: {len(valid_indices)}/5")

        if valid_indices:
            sample_idx = valid_indices[0]
            sample_row = metadata.iloc[sample_idx]
            # Map the distance into a (0, 1] score for readability.
            similarity = 1 / (1 + distances[0][0])
            add_result(f"\n📋 Best match (similarity: {similarity:.3f}):")
            add_result(f"  Title: {sample_row.get('title', 'N/A')}")
            add_result(f"  Text: {str(sample_row.get('text', 'N/A'))[:200]}...")

    except Exception as e:
        add_result(f"❌ Full search test failed: {e}")
        return "\n".join(results)

    # Environment Info
    add_result("\n🐍 PYTHON ENVIRONMENT")
    add_result("=" * 30)

    import sys
    add_result(f"Python version: {sys.version}")
    add_result(f"Platform: {sys.platform}")

    try:
        # importlib.metadata replaces the deprecated pkg_resources API.
        from importlib import metadata as importlib_metadata
        installed = {
            dist.metadata["Name"]
            for dist in importlib_metadata.distributions()
            if dist.metadata["Name"]
        }
        required = ['gradio', 'faiss-cpu', 'google-generativeai', 'pandas', 'numpy', 'plotly']
        missing = [pkg for pkg in required if pkg not in installed]
        if missing:
            add_result(f"⚠️ Missing packages: {missing}")
        else:
            add_result("✅ All required packages installed")
    except Exception:
        add_result("⚠️ Could not check installed packages")

    add_result("\n🎉 ALL TESTS COMPLETED!")
    add_result("\nIf you see this message, your system should be working!")
    add_result("You can now replace this debug app.py with your original app.py")

    return "\n".join(results)
200
+
201
def create_debug_interface():
    """Create a simple Gradio interface for debugging.

    Wraps run_all_checks() in a one-button Blocks app: the diagnostics run
    automatically when the page loads and can be re-run via the button.
    Returns the (unlaunched) gr.Blocks app.
    """

    with gr.Blocks(title="Debug Hugging Face Space") as app:
        # Banner explaining what this temporary app is for.
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(90deg, #ff6b6b, #4ecdc4); color: white; border-radius: 10px;">
            <h1>🔧 Hugging Face Space Debugger</h1>
            <p>This will help identify why your search isn't working</p>
        </div>
        """)

        with gr.Row():
            run_btn = gr.Button("🚀 Run Full Diagnostic", variant="primary", size="lg")

        with gr.Row():
            # Read-only textbox with a copy button so the report can be
            # pasted into an issue or chat easily.
            output = gr.Textbox(
                label="Diagnostic Results",
                lines=30,
                max_lines=50,
                interactive=False,
                show_copy_button=True
            )

        # Auto-run diagnostics on load, and again on every button press.
        app.load(run_all_checks, outputs=output)
        run_btn.click(run_all_checks, outputs=output)

        # Static footer describing what is checked and the next step.
        gr.HTML("""
        <div style="margin-top: 20px; padding: 15px; background: #f0f8ff; border-radius: 5px;">
            <h3>📋 What This Checks:</h3>
            <ul>
                <li><strong>API Key:</strong> Verifies Google Gemini API key is set correctly</li>
                <li><strong>Files:</strong> Checks if FAISS index and metadata CSV exist</li>
                <li><strong>Data:</strong> Validates file contents and structure</li>
                <li><strong>Search:</strong> Tests the complete search pipeline</li>
                <li><strong>Environment:</strong> Verifies Python packages and setup</li>
            </ul>
            <p><strong>Next Steps:</strong> Once all tests pass, replace this debug app.py with your original app.py</p>
        </div>
        """)

    return app
243
+
244
if __name__ == "__main__":
    # Run diagnostics in console first -- failures then show up in the
    # container log even if the web UI never comes up.
    print("Running initial diagnostics...")
    run_all_checks()

    # Launch Gradio interface on the standard Hugging Face Spaces port.
    app = create_debug_interface()
    app.launch(
        share=True,             # also create a public gradio.live link
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True         # surface tracebacks in the browser UI
    )
app_old.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # app.py (main entry point)
3
+ # =============================================================================
4
+ import gradio as gr
5
+ from data_handler import DataHandler
6
+ from gradio_callbacks import GradioCallbacks
7
+ from gradio_components import (
8
+ create_header, create_qa_tab, create_overview_tab,
9
+ create_methodology_tab, create_filters_tab, create_about_tab
10
+ )
11
+
12
def main():
    """Assemble and return the Gradio Blocks app (older layout, no pivot tab)."""
    # Shared data layer: the handler owns the corpus, the callbacks object
    # adapts it to Gradio events.
    handler = DataHandler()
    callbacks = GradioCallbacks(handler)
    shared = handler.get_data()

    with gr.Blocks(
        theme=gr.themes.Monochrome(),
        title="FCAS Research Methods Evidence Mapping",
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .main-header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 2rem;
            border-radius: 10px;
            margin-bottom: 2rem;
        }
        """
    ) as app:
        # Page banner, then one builder function per tab.
        create_header()

        with gr.Tabs():
            create_qa_tab(callbacks)
            create_overview_tab(callbacks)
            create_methodology_tab(callbacks)
            create_filters_tab(callbacks, shared['countries_list'], shared['sectors_list'])
            create_about_tab()

    return app
49
+ if __name__ == "__main__":
50
+ app = main()
51
+ app.launch(
52
+ share=True,
53
+ server_name="0.0.0.0",
54
+ server_port=7860,
55
+ show_error=True
56
+ )
chunk_metadata.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e78c3fdc52942e2bd98529d920258ccb378a5b2ec2ef82afb6617dbc48d15ae
3
+ size 202374040
config.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# API Configuration.
# The API key has been set under different environment-variable names across
# this repo (app_debug.py probes the same three); accept any of them, and
# keep the original placeholder as the final fallback so behaviour is
# unchanged when no key is configured.
API_KEY = (
    os.environ.get("GOOGLE_API_KEY")
    or os.environ.get("GEMINI_API_KEY")
    or os.environ.get("gemini_api")
    or "your_api_key_here"
)
INDEX_PATH = "research_chunks.faiss"   # FAISS vector index on disk
METADATA_PATH = "chunk_metadata.csv"   # per-chunk metadata table

# Countries to include in analysis (fragile and conflict-affected states)
SPECIFIC_COUNTRIES = [
    "Burkina Faso", "Afghanistan", "Mali", "Sudan", "Haiti", "Somalia",
    "Niger", "Syria", "South Sudan", "Libya", "Palestinian Territories",
    "Central African Republic", "Iraq", "Nigeria", "Lebanon", "Ethiopia",
    "Democratic Republic of the Congo", "Cameroon", "Chad", "Mozambique", "Myanmar"
]

# UI Configuration
APP_TITLE = "AI-powered chatbot"
#APP_DESCRIPTION = "AI synthesis across thousands of research documents"

# Default values for the Q&A controls
DEFAULT_MAX_STUDIES = 6
DEFAULT_MIN_RELEVANCE = 0.7
DEFAULT_MIN_YEAR = 2015
DEFAULT_MAX_YEAR = 2025
data_handler.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from synthesis_qa_backend import ResearchSynthesizer
4
+ from config import API_KEY, INDEX_PATH, METADATA_PATH, SPECIFIC_COUNTRIES
5
+
6
class DataHandler:
    """Loads the FAISS-backed synthesizer and the chunk-metadata table, and
    exposes the deduplicated document frame plus the dropdown choice lists.
    """

    def __init__(self):
        self.synthesizer = None        # ResearchSynthesizer, or None after a load failure
        self.docs_df = pd.DataFrame()  # one row per unique record_id
        self.countries_list = []       # dropdown choices, restricted to SPECIFIC_COUNTRIES
        self.sectors_list = []         # dropdown choices from world_bank_sector
        self.load_data()

    def load_data(self):
        """Initialize the research system and load data.

        On any failure the handler degrades to an empty state instead of
        raising, so the UI can still start and show an empty app.
        """
        try:
            self.synthesizer = ResearchSynthesizer(INDEX_PATH, METADATA_PATH, API_KEY)
            metadata_df = pd.read_csv(METADATA_PATH)
            # The CSV holds one row per *chunk*; keep one row per document.
            self.docs_df = metadata_df.drop_duplicates(subset=['record_id'])
            print(f"✅ Loaded {len(self.docs_df)} unique documents")

            # Get unique values for dropdowns
            self.countries_list, self.sectors_list = self._get_unique_values()

        except Exception as e:
            print(f"❌ Error loading system: {e}")
            self.synthesizer = None
            self.docs_df = pd.DataFrame()

    def _get_unique_values(self):
        """Return (countries, sectors) choice lists for the filter dropdowns."""
        if self.docs_df.empty:
            return [], []

        countries_list = []
        sectors_list = []

        if 'study_countries' in self.docs_df.columns:
            for countries_str in self.docs_df['study_countries'].dropna():
                # dropna() already removed real NaN; this guards against the
                # *strings* "nan"/"none" and empty cells left by upstream export.
                if str(countries_str).lower() in ['nan', 'none', '']:
                    continue
                # Cells may be comma- or semicolon-delimited lists.
                countries = [c.strip() for c in str(countries_str).replace(';', ',').split(',')]
                countries_list.extend(
                    c for c in countries if c in SPECIFIC_COUNTRIES and len(c) > 1
                )

            countries_list = sorted(set(countries_list))

        if 'world_bank_sector' in self.docs_df.columns:
            sectors_list = sorted(self.docs_df['world_bank_sector'].dropna().unique().tolist())

        return countries_list, sectors_list

    def get_data(self):
        """Return all shared objects as a dict consumed by GradioCallbacks."""
        return {
            'synthesizer': self.synthesizer,
            'docs_df': self.docs_df,
            'countries_list': self.countries_list,
            'sectors_list': self.sectors_list
        }
gradio_callbacks.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # gradio_callbacks.py
3
+ # =============================================================================
4
+ import pandas as pd
5
+ from visualisations import (
6
+ create_world_map, create_interactive_data_explorer,
7
+ filter_and_analyze, create_pivot_analysis
8
+ )
9
+
10
+ class GradioCallbacks:
11
+ def __init__(self, data_handler):
12
+ self.data = data_handler.get_data()
13
+
14
+ def get_column_choices(self):
15
+ """Get available columns for pivot analysis"""
16
+ if self.data['docs_df'].empty:
17
+ return [], [], []
18
+
19
+ df = self.data['docs_df']
20
+
21
+ # Define potential categorical columns
22
+ categorical_cols = []
23
+ potential_categorical = [
24
+ 'world_bank_sector', 'research_design', 'data_collection_method',
25
+ 'analysis_type', 'study_countries', 'population', 'author_income_group',
26
+ 'has_validation', 'has_randomization', 'has_mixed_methods',
27
+ 'has_advanced_analysis', 'world_bank_subsector', 'topic_summary'
28
+ ]
29
+
30
+ for col in potential_categorical:
31
+ if col in df.columns and df[col].notna().sum() > 0:
32
+ # Check if column has reasonable number of unique values (2-50)
33
+ unique_count = df[col].nunique()
34
+ if 2 <= unique_count <= 50:
35
+ categorical_cols.append(col)
36
+
37
+ # Define potential numeric columns
38
+ numeric_cols = []
39
+ potential_numeric = [
40
+ 'publication_year', 'sample_numeric', 'rigor_score', 'sdg_number',
41
+ 'research_year', 'word_count'
42
+ ]
43
+
44
+ for col in potential_numeric:
45
+ if col in df.columns:
46
+ # Try to convert to numeric and check if we have valid values
47
+ numeric_values = pd.to_numeric(df[col], errors='coerce')
48
+ if numeric_values.notna().sum() > 0:
49
+ numeric_cols.append(col)
50
+
51
+ all_cols = list(df.columns)
52
+
53
+ return categorical_cols, numeric_cols, all_cols
54
+
55
+ def create_pivot_analysis(self, row_var, col_var, value_var, agg_func):
56
+ """Create pivot analysis with visualization"""
57
+ try:
58
+ from visualisations import create_pivot_analysis
59
+
60
+ if not row_var or not col_var:
61
+ return None, "Please select both row and column variables."
62
+
63
+ if self.data['docs_df'].empty:
64
+ return None, "No data available for analysis."
65
+
66
+ # Check if variables exist
67
+ df = self.data['docs_df']
68
+ if row_var not in df.columns or col_var not in df.columns:
69
+ return None, f"Selected variables not found in dataset."
70
+
71
+ # Handle value variable
72
+ if value_var == "None" or not value_var:
73
+ value_var = None
74
+ elif value_var not in df.columns:
75
+ return None, f"Value variable '{value_var}' not found in dataset."
76
+
77
+ # Create the pivot analysis
78
+ result = create_pivot_analysis(df, row_var, col_var, value_var, agg_func)
79
+
80
+ if result is None:
81
+ return None, "Could not create pivot analysis. Check your variable selections."
82
+
83
+ # Handle different return types
84
+ if isinstance(result, tuple):
85
+ fig, pivot_df = result
86
+ else:
87
+ fig = result
88
+ pivot_df = None
89
+
90
+ # Create summary text
91
+ if pivot_df is not None:
92
+ summary = f"**Pivot Analysis: {row_var} Γ— {col_var}**\n\n"
93
+ summary += f"- Rows: {len(pivot_df.index)} categories\n"
94
+ summary += f"- Columns: {len(pivot_df.columns)} categories\n"
95
+ summary += f"- Aggregation: {agg_func}\n"
96
+
97
+ if value_var:
98
+ summary += f"- Value variable: {value_var}\n"
99
+
100
+ # Add top findings
101
+ if hasattr(pivot_df, 'values'):
102
+ total_sum = pivot_df.values.sum()
103
+ summary += f"- Total: {total_sum:.0f}\n"
104
+
105
+ # Find max cell
106
+ max_idx = pivot_df.values.argmax()
107
+ max_row_idx, max_col_idx = divmod(max_idx, pivot_df.shape[1])
108
+ max_row = pivot_df.index[max_row_idx]
109
+ max_col = pivot_df.columns[max_col_idx]
110
+ max_val = pivot_df.values[max_row_idx, max_col_idx]
111
+ summary += f"- Highest value: {max_val:.1f} ({max_row} Γ— {max_col})\n"
112
+ else:
113
+ summary = f"Pivot analysis completed for {row_var} Γ— {col_var}"
114
+
115
+ return fig, summary
116
+
117
+ except Exception as e:
118
+ error_msg = f"Error creating pivot analysis: {str(e)}"
119
+ print(error_msg) # For debugging
120
+ return None, error_msg
121
+
122
+ def create_overview_plots(self):
123
+ """Create overview plots for the Global Overview tab"""
124
+ try:
125
+ import plotly.express as px
126
+
127
+ world_map = create_world_map(self.data['docs_df'])
128
+
129
+ # Simple sector analysis
130
+ if 'world_bank_sector' in self.data['docs_df'].columns:
131
+ sector_counts = self.data['docs_df']['world_bank_sector'].value_counts().head(10)
132
+ sector_plot = px.bar(
133
+ x=sector_counts.values,
134
+ y=sector_counts.index,
135
+ orientation='h',
136
+ title="Studies by World Bank Sector",
137
+ labels={'x': 'Number of Studies', 'y': 'Sector'}
138
+ )
139
+ else:
140
+ sector_plot = None
141
+
142
+ if 'research_design' in self.data['docs_df'].columns:
143
+ design_counts = self.data['docs_df']['research_design'].value_counts().head(8)
144
+ design_plot = px.pie(
145
+ values=design_counts.values,
146
+ names=design_counts.index,
147
+ title="Research Design Distribution"
148
+ )
149
+ else:
150
+ design_plot = None
151
+
152
+ return world_map, sector_plot, design_plot
153
+
154
+ except Exception as e:
155
+ print(f"Error creating overview plots: {e}")
156
+ return None, None, None
157
+
158
def create_methodology_analysis(self):
    """Return the data-completeness figure for the Methodology tab.

    ``create_interactive_data_explorer`` may return either a bare figure
    or a ``(figure, summary)`` tuple; only the figure is forwarded.
    """
    result = create_interactive_data_explorer(self.data['docs_df'])
    return result[0] if isinstance(result, tuple) else result
164
+
165
def filter_studies(self, countries, sectors, min_year, max_year, rct_only, min_sample):
    """Filter studies and return the first 20 matches as Markdown.

    Args:
        countries: list of country names to keep (falsy = no filter).
        sectors: list of World Bank sectors to keep (falsy = no filter).
        min_year / max_year: inclusive publication-year bounds.
        rct_only: when truthy, keep only randomized controlled trials.
        min_sample: minimum sample size (ignored when falsy or <= 0).

    Returns:
        str: Markdown-formatted study summaries, or a human-readable
        message when there is no data / no match / an error.
    """
    if self.data['docs_df'].empty:
        return "No data available for filtering."

    try:
        # Translate UI inputs into the keyword filters understood by
        # filter_and_analyze; falsy inputs mean "no constraint".
        filters = {}
        if countries:
            filters['countries'] = countries
        if sectors:
            filters['sectors'] = sectors
        if min_year:
            filters['min_year'] = int(min_year)
        if max_year:
            filters['max_year'] = int(max_year)
        if rct_only:
            filters['has_rct'] = True
        if min_sample and min_sample > 0:
            filters['min_sample_size'] = int(min_sample)

        filtered_df = filter_and_analyze(self.data['docs_df'], **filters)

        if filtered_df is None or filtered_df.empty:
            return "No studies match your filters."

        # (column, label) pairs rendered as bullet lines under each title.
        # Fields needing special formatting are handled separately below.
        bullet_fields = [
            ('authors', 'Authors'),
            ('publication_year', 'Year'),
            ('study_countries', 'Countries'),
            ('world_bank_sector', 'Sector'),
            ('research_design', 'Design'),
        ]

        results = []
        for _, row in filtered_df.head(20).iterrows():
            result_parts = []

            if 'title' in row:
                result_parts.append(f"### {row['title']}")

            for col, label in bullet_fields:
                if col in row:
                    result_parts.append(f"- **{label}**: {row[col]}")

            # Sample size is shown as an integer and skipped when missing.
            if 'sample_numeric' in row and pd.notna(row['sample_numeric']):
                result_parts.append(f"- **Sample Size**: {int(row['sample_numeric'])}")

            # Source column may hold booleans or strings; normalize to Yes/No.
            if 'has_randomization' in row:
                rct_status = 'Yes' if str(row['has_randomization']).lower() in ['true', 'yes', '1'] else 'No'
                result_parts.append(f"- **RCT**: {rct_status}")

            results.append('\n'.join(result_parts))

        result_text = '\n\n'.join(results)

        if len(filtered_df) > 20:
            result_text += f"\n\n*... and {len(filtered_df) - 20} more studies*"

        return result_text

    except Exception as e:
        return f"Error filtering studies: {e}"
235
+
236
def answer_question(self, question):
    """Answer a research question with a multi-study synthesis.

    Args:
        question: free-text research question from the UI.

    Returns:
        tuple[str, str, str]: (answer markdown, references markdown,
        stats markdown). The last two are empty strings on any
        non-successful outcome.
    """
    # Synthesizer may be None when initialization failed upstream.
    if not self.data['synthesizer']:
        return "⚠️ System not initialized. Please check configuration.", "", ""

    if not question.strip():
        return "Please enter a research question.", "", ""

    try:
        result = self.data['synthesizer'].answer_research_question(query=question)

        # Handle different response types from new system.
        # 'out_of_scope': query is not about the research domain; show the
        # backend's message plus any suggested reformulations.
        if result['quality'] == 'out_of_scope':
            suggestions_text = ""
            if result['suggestions']:
                suggestions_text = "\n\n**πŸ’‘ Try queries like:**\n" + "\n".join([f"β€’ {s}" for s in result['suggestions']])

            return f"⚠️ {result['answer']}{suggestions_text}", "", ""

        if result['quality'] == 'no_results':
            return "No relevant studies found. Try a broader query or different keywords.", "", ""

        if result['quality'] == 'insufficient':
            return f"⚠️ {result['answer']}", "", ""

        # Defensive double-check: some quality values may still carry zero studies.
        if result['study_count'] == 0:
            return "No relevant studies found. Try a broader query.", "", ""

        # Format the successful response; unknown quality values get no badge.
        quality_indicator = {
            'high': '🟒 High Quality',
            'moderate': '🟑 Moderate Quality',
            'low': 'πŸ”΄ Low Quality'
        }.get(result['quality'], '')

        answer = f"## πŸ” Research Synthesis {quality_indicator}\n\n{result['answer']}"

        references = f"## πŸ“š References\n\n{result['references']}"

        # Enhanced stats with quality info (quality_message is optional).
        stats = f"**Studies analyzed:** {result['study_count']}"
        if 'quality_message' in result:
            stats += f"\n**Quality:** {result['quality_message']}"

        return answer, references, stats

    except Exception as e:
        return f"Error processing query: {str(e)}", "", ""
gradio_callbacks_old.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # =============================================================================
3
+ # gradio_callbacks.py
4
+ # =============================================================================
5
+ import pandas as pd
6
+ from visualisations import (
7
+ create_world_map, create_interactive_data_explorer,
8
+ filter_and_analyze
9
+ )
10
+
11
class GradioCallbacks:
    """Legacy callback collection for the Gradio UI (superseded copy).

    Wraps a data handler and exposes the methods wired to UI events:
    question answering, overview plots, methodology analysis, and
    study filtering.
    """

    def __init__(self, data_handler):
        # Snapshot of the handler's data dict (docs_df, synthesizer, ...).
        self.data = data_handler.get_data()

    def answer_question(self, question, max_studies, min_relevance):
        """Answer research question with synthesis.

        Returns (answer, references, stats) markdown strings; the last two
        are empty on failure.
        """
        # NOTE(review): min_relevance is accepted but never forwarded to the
        # backend call below — confirm whether it was meant to be passed.
        if not self.data['synthesizer']:
            return "❌ System not initialized. Please check configuration.", "", ""

        if not question.strip():
            return "Please enter a research question.", "", ""

        try:
            result = self.data['synthesizer'].answer_research_question(
                query=question,
                min_studies=2,
                max_studies=int(max_studies)
            )

            if result['study_count'] == 0:
                return "No relevant studies found. Try a broader query.", "", ""

            answer = f"## πŸ“ Research Synthesis\n\n{result['answer']}"
            references = f"## πŸ“š References\n\n{result['references']}"
            stats = f"**Studies analyzed:** {result['study_count']}"

            return answer, references, stats

        except Exception as e:
            return f"Error processing query: {e}", "", ""

    def create_overview_plots(self):
        """Create overview plots for the Global Overview tab.

        Returns (world_map, sector_plot, design_plot); elements are None
        when the needed column is missing or plotting fails.
        """
        try:
            import plotly.express as px

            world_map = create_world_map(self.data['docs_df'])

            # Simple sector analysis: top-10 horizontal bar chart.
            if 'world_bank_sector' in self.data['docs_df'].columns:
                sector_counts = self.data['docs_df']['world_bank_sector'].value_counts().head(10)
                sector_plot = px.bar(
                    x=sector_counts.values,
                    y=sector_counts.index,
                    orientation='h',
                    title="Studies by World Bank Sector",
                    labels={'x': 'Number of Studies', 'y': 'Sector'}
                )
            else:
                sector_plot = None

            # Top-8 research designs as a pie chart.
            if 'research_design' in self.data['docs_df'].columns:
                design_counts = self.data['docs_df']['research_design'].value_counts().head(8)
                design_plot = px.pie(
                    values=design_counts.values,
                    names=design_counts.index,
                    title="Research Design Distribution"
                )
            else:
                design_plot = None

            return world_map, sector_plot, design_plot

        except Exception as e:
            print(f"Error creating overview plots: {e}")
            return None, None, None

    def create_methodology_analysis(self):
        """Return the data-completeness figure for the Methodology tab."""
        result = create_interactive_data_explorer(self.data['docs_df'])
        if isinstance(result, tuple):
            return result[0]  # Return only the figure
        else:
            return result

    def filter_studies(self, countries, sectors, min_year, max_year, rct_only, min_sample):
        """Filter studies and return formatted results (first 20 matches)."""
        if self.data['docs_df'].empty:
            return "No data available for filtering."

        try:
            # Build keyword filters; falsy UI values mean "no constraint".
            filters = {}

            if countries:
                filters['countries'] = countries
            if sectors:
                filters['sectors'] = sectors
            if min_year:
                filters['min_year'] = int(min_year)
            if max_year:
                filters['max_year'] = int(max_year)
            if rct_only:
                filters['has_rct'] = True
            if min_sample and min_sample > 0:
                filters['min_sample_size'] = int(min_sample)

            filtered_df = filter_and_analyze(self.data['docs_df'], **filters)

            if filtered_df is None or filtered_df.empty:
                return "No studies match your filters."

            # Format results
            results = []
            # NOTE(review): display_cols is unused below — dead code?
            display_cols = ['title', 'authors', 'publication_year', 'study_countries',
                            'world_bank_sector', 'research_design', 'sample_numeric']

            for _, row in filtered_df.head(20).iterrows():
                result_parts = []

                if 'title' in row:
                    result_parts.append(f"### {row['title']}")

                if 'authors' in row:
                    result_parts.append(f"- **Authors**: {row['authors']}")

                if 'publication_year' in row:
                    result_parts.append(f"- **Year**: {row['publication_year']}")

                if 'study_countries' in row:
                    result_parts.append(f"- **Countries**: {row['study_countries']}")

                if 'world_bank_sector' in row:
                    result_parts.append(f"- **Sector**: {row['world_bank_sector']}")

                if 'research_design' in row:
                    result_parts.append(f"- **Design**: {row['research_design']}")

                # Shown as an integer; skipped when the value is missing.
                if 'sample_numeric' in row and pd.notna(row['sample_numeric']):
                    result_parts.append(f"- **Sample Size**: {int(row['sample_numeric'])}")

                # Source column may hold booleans or strings; normalize to Yes/No.
                if 'has_randomization' in row:
                    rct_status = 'Yes' if str(row['has_randomization']).lower() in ['true', 'yes', '1'] else 'No'
                    result_parts.append(f"- **RCT**: {rct_status}")

                results.append('\n'.join(result_parts))

            result_text = '\n\n'.join(results)

            if len(filtered_df) > 20:
                result_text += f"\n\n*... and {len(filtered_df) - 20} more studies*"

            return result_text

        except Exception as e:
            return f"Error filtering studies: {e}"
gradio_components.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # gradio_components.py
3
+ # =============================================================================
4
+ import gradio as gr
5
+ from config import APP_TITLE, DEFAULT_MAX_STUDIES, DEFAULT_MIN_RELEVANCE, DEFAULT_MIN_YEAR, DEFAULT_MAX_YEAR
6
+
7
+
8
def create_overview_tab(callbacks):
    """Create the Global Overview tab.

    Lays out the world map and two summary plots, and wires the refresh
    button to callbacks.create_overview_plots.
    """
    with gr.Tab("🌍 Global Overview", id="overview"):
        gr.Markdown("## Research Landscape Analysis")

        with gr.Row():
            world_map_plot = gr.Plot(label="Global Research Distribution")

        with gr.Row():
            sector_plot = gr.Plot(label="Sector Analysis")
            design_plot = gr.Plot(label="Research Designs")

        # Plots load on demand rather than at app start-up.
        overview_btn = gr.Button("πŸ”„ Load Overview", variant="secondary")
        overview_btn.click(
            callbacks.create_overview_plots,
            outputs=[world_map_plot, sector_plot, design_plot]
        )
25
+
26
def create_methodology_tab(callbacks):
    """Create the Methodology Dashboard tab.

    A single on-demand plot driven by callbacks.create_methodology_analysis.
    """
    with gr.Tab("πŸ“Š Methodology Dashboard", id="methods"):
        gr.Markdown("## Deep Dive into Research Methods & Quality")

        methodology_plot = gr.Plot(label="Data Completeness")

        methodology_btn = gr.Button("πŸ”„ Load Analysis", variant="secondary")
        methodology_btn.click(
            callbacks.create_methodology_analysis,
            outputs=[methodology_plot]
        )
38
+
39
def create_filters_tab(callbacks, countries_list, sectors_list):
    """Create the Advanced Search tab.

    Args:
        callbacks: GradioCallbacks instance providing filter_studies.
        countries_list: dropdown choices for the country filter.
        sectors_list: dropdown choices for the sector filter.
    """
    with gr.Tab("πŸ” Advanced Search", id="filters"):
        gr.Markdown("## Filter and Explore Studies")

        with gr.Row():
            # Column 1: categorical filters.
            with gr.Column():
                country_filter = gr.Dropdown(
                    choices=countries_list,
                    label="Countries",
                    multiselect=True,
                    interactive=True
                )

                sector_filter = gr.Dropdown(
                    choices=sectors_list,
                    label="World Bank Sectors",
                    multiselect=True,
                    interactive=True
                )

            # Column 2: publication-year range.
            with gr.Column():
                min_year_filter = gr.Number(
                    label="Minimum Publication Year",
                    value=DEFAULT_MIN_YEAR,
                    precision=0
                )

                max_year_filter = gr.Number(
                    label="Maximum Publication Year",
                    value=DEFAULT_MAX_YEAR,
                    precision=0
                )

            # Column 3: methodological filters.
            with gr.Column():
                rct_filter = gr.Checkbox(
                    label="Only Randomized Controlled Trials",
                    value=False
                )

                min_sample_filter = gr.Number(
                    label="Minimum Sample Size",
                    value=None,
                    precision=0
                )

        filter_btn = gr.Button("πŸ” Apply Filters", variant="primary")

        filtered_results = gr.Markdown(label="Filtered Studies")

        filter_btn.click(
            callbacks.filter_studies,
            inputs=[country_filter, sector_filter, min_year_filter,
                    max_year_filter, rct_filter, min_sample_filter],
            outputs=filtered_results
        )
95
+
96
def create_about_tab():
    """Create the About tab (static Markdown; no callbacks needed)."""
    with gr.Tab("ℹ️ About", id="about"):
        gr.Markdown("""
        ## About This Research Q&A System

        This system provides intelligent synthesis across thousands of research documents using:

        ### πŸ€– **AI-Powered Analysis**
        - **Semantic Search**: Uses Google's Gemini embeddings to find relevant studies
        - **Smart Synthesis**: Combines findings across multiple studies with proper citations
        - **Rich Metadata**: Leverages 35+ metadata fields per document

        ### πŸ“Š **Research Quality Metrics**
        - **Rigor Scores**: Methodological quality assessment
        - **Study Design**: RCTs, observational studies, mixed methods
        - **Validation Status**: Peer review and replication information

        ### 🌍 **Global Coverage**
        - Studies from fragile and conflict-affected states
        - Multiple World Bank sectors and regions
        - Comprehensive geographic and temporal coverage

        ### πŸ”¬ **Advanced Features**
        - Interactive visualizations and maps
        - Advanced filtering and search capabilities
        - Academic-style citations with full context

        ---

        **Data Sources**: Research documents from development economics, impact evaluation, and policy studies

        **Technology**: Built with Gradio, FAISS, Google Gemini AI, and Plotly
        """)
130
+
131
def create_pivot_tab(callbacks):
    """Create the Pivot Analysis tab.

    Dropdown choices come from callbacks.get_column_choices(), which is
    expected to return (categorical_cols, numeric_cols, all_cols).
    """
    with gr.Tab("πŸ“Š Pivot Analysis", id="pivot"):
        gr.Markdown("## Interactive Pivot Table Analysis")
        gr.Markdown("Create cross-tabulations and pivot tables to explore relationships in the data")

        with gr.Row():
            with gr.Column():
                # Get column choices
                categorical_cols, numeric_cols, all_cols = callbacks.get_column_choices()

                # Default to the first two categoricals when available.
                row_var = gr.Dropdown(
                    choices=categorical_cols,
                    label="Row Variable",
                    value=categorical_cols[0] if categorical_cols else None,
                    interactive=True
                )

                col_var = gr.Dropdown(
                    choices=categorical_cols,
                    label="Column Variable",
                    value=categorical_cols[1] if len(categorical_cols) > 1 else None,
                    interactive=True
                )

                # None means plain cross-tab counts instead of aggregation.
                value_var = gr.Dropdown(
                    choices=[None] + numeric_cols,
                    label="Value Variable (optional - for numeric aggregation)",
                    value=None,
                    interactive=True
                )

                agg_func = gr.Dropdown(
                    choices=["count", "mean", "sum"],
                    label="Aggregation Function",
                    value="count",
                    interactive=True
                )

                pivot_btn = gr.Button("πŸ”„ Create Pivot Analysis", variant="primary")

            with gr.Column():
                gr.Markdown("### πŸ’‘ Suggested Analyses")
                gr.Markdown("""
                **Popular combinations:**
                - Research Design Γ— World Bank Sector (count)
                - Countries Γ— Has Randomization (count)
                - Author Income Group Γ— Data Collection Method (count)
                - Research Design Γ— Rigor Score (mean)
                - World Bank Sector Γ— Sample Size (mean)

                **Tips:**
                - Use 'count' to see frequency distributions
                - Use 'mean' or 'sum' with numeric value variables
                - Choose variables with reasonable number of categories
                """)

        with gr.Row():
            pivot_plot = gr.Plot(label="Pivot Heatmap")

        with gr.Row():
            pivot_summary = gr.Markdown(label="Pivot Table Summary")

        pivot_btn.click(
            callbacks.create_pivot_analysis,
            inputs=[row_var, col_var, value_var, agg_func],
            outputs=[pivot_plot, pivot_summary]
        )
199
+
200
def create_qa_tab(callbacks):
    """Create the Q&A tab.

    The submit button feeds the question text to callbacks.answer_question,
    which returns (answer, references, stats) markdown strings.
    """
    with gr.Tab("πŸ€– Ask Questions", id="qa"):
        gr.Markdown("## Ask questions about research methods, findings, or approaches")

        with gr.Row():
            with gr.Column(scale=2):
                question_input = gr.Textbox(
                    label="Research Question",
                    placeholder="e.g., 'What sampling strategies work best in conflict-affected areas?'",
                    lines=3
                )

                submit_btn = gr.Button("πŸ” Search & Synthesize", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### πŸ’‘ Example Methodological Questions")
                gr.Markdown("""
                - What sampling strategies work best in conflict-affected areas?
                - How do researchers ensure data quality during active conflict?
                - What are the ethical considerations for RCTs in fragile states?
                - How do researchers adapt survey instruments for low-literacy populations?
                - What methods are used to track mobile populations in FCAS?
                - How do studies address attrition bias in longitudinal FCAS research?
                - What proxy measures are used when direct measurement is impossible?
                - How do researchers validate self-reported data in conflict settings?
                - What approaches work for establishing counterfactuals in FCAS?
                - How do studies handle missing data due to displacement or conflict?
                """)

        # Results
        with gr.Row():
            answer_output = gr.Markdown(label="Synthesis")

        with gr.Row():
            references_output = gr.Markdown(label="References")

        stats_output = gr.Markdown(label="Statistics")

        submit_btn.click(
            callbacks.answer_question,
            inputs=[question_input],
            outputs=[answer_output, references_output, stats_output]
        )
gradio_components_old.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # gradio_components.py
3
+ # =============================================================================
4
+ import gradio as gr
5
+ from config import APP_TITLE, APP_DESCRIPTION, DEFAULT_MAX_STUDIES, DEFAULT_MIN_RELEVANCE, DEFAULT_MIN_YEAR, DEFAULT_MAX_YEAR
6
+
7
def create_header():
    """Create the app header (legacy copy).

    Renders APP_TITLE and APP_DESCRIPTION inside a styled HTML banner;
    the 'main-header' class is presumably defined in the app's CSS —
    TODO confirm where it is declared.
    """
    return gr.HTML(f"""
    <div class="main-header">
        <h1>{APP_TITLE}</h1>
        <p>{APP_DESCRIPTION}</p>
    </div>
    """)
15
+
16
def create_qa_tab(callbacks):
    """Create the Q&A tab (legacy copy).

    Unlike the newer version, this variant exposes max-studies and
    min-relevance sliders and passes them to callbacks.answer_question.
    """
    with gr.Tab("πŸ€– Ask Questions", id="qa"):
        gr.Markdown("## Ask questions about research methods, findings, or approaches")

        with gr.Row():
            with gr.Column(scale=2):
                question_input = gr.Textbox(
                    label="Research Question",
                    placeholder="e.g., 'What methods were used in agricultural research in Yemen?'",
                    lines=3
                )

                with gr.Row():
                    max_studies = gr.Slider(
                        label="Max Studies to Analyze",
                        minimum=3,
                        maximum=10,
                        value=DEFAULT_MAX_STUDIES,
                        step=1
                    )

                    min_relevance = gr.Slider(
                        label="Minimum Relevance Score",
                        minimum=0.5,
                        maximum=0.9,
                        value=DEFAULT_MIN_RELEVANCE,
                        step=0.05
                    )

                submit_btn = gr.Button("πŸ” Search & Synthesize", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### πŸ’‘ Example Questions")
                gr.Markdown("""
                - What methods were used in agricultural research in Yemen?
                - How do cash transfer programs impact poverty reduction?
                - What are the main challenges in education programs in fragile states?
                - What evaluation approaches are used for health interventions?
                - Which countries have the most rigorous impact evaluations?
                """)

        # Results
        with gr.Row():
            answer_output = gr.Markdown(label="Synthesis")

        with gr.Row():
            references_output = gr.Markdown(label="References")

        stats_output = gr.Markdown(label="Statistics")

        submit_btn.click(
            callbacks.answer_question,
            inputs=[question_input, max_studies, min_relevance],
            outputs=[answer_output, references_output, stats_output]
        )
72
+
73
def create_overview_tab(callbacks):
    """Create the Global Overview tab (legacy copy, identical layout)."""
    with gr.Tab("🌍 Global Overview", id="overview"):
        gr.Markdown("## Research Landscape Analysis")

        with gr.Row():
            world_map_plot = gr.Plot(label="Global Research Distribution")

        with gr.Row():
            sector_plot = gr.Plot(label="Sector Analysis")
            design_plot = gr.Plot(label="Research Designs")

        # Plots load on demand rather than at app start-up.
        overview_btn = gr.Button("πŸ”„ Load Overview", variant="secondary")
        overview_btn.click(
            callbacks.create_overview_plots,
            outputs=[world_map_plot, sector_plot, design_plot]
        )
90
+
91
def create_methodology_tab(callbacks):
    """Create the Methodology Dashboard tab (legacy copy)."""
    with gr.Tab("πŸ“Š Methodology Dashboard", id="methods"):
        gr.Markdown("## Deep Dive into Research Methods & Quality")

        methodology_plot = gr.Plot(label="Data Completeness")

        methodology_btn = gr.Button("πŸ”„ Load Analysis", variant="secondary")
        methodology_btn.click(
            callbacks.create_methodology_analysis,
            outputs=[methodology_plot]
        )
103
+
104
def create_filters_tab(callbacks, countries_list, sectors_list):
    """Create the Advanced Search tab (legacy copy).

    Args:
        callbacks: GradioCallbacks instance providing filter_studies.
        countries_list: dropdown choices for the country filter.
        sectors_list: dropdown choices for the sector filter.
    """
    with gr.Tab("πŸ” Advanced Search", id="filters"):
        gr.Markdown("## Filter and Explore Studies")

        with gr.Row():
            # Column 1: categorical filters.
            with gr.Column():
                country_filter = gr.Dropdown(
                    choices=countries_list,
                    label="Countries",
                    multiselect=True,
                    interactive=True
                )

                sector_filter = gr.Dropdown(
                    choices=sectors_list,
                    label="World Bank Sectors",
                    multiselect=True,
                    interactive=True
                )

            # Column 2: publication-year range.
            with gr.Column():
                min_year_filter = gr.Number(
                    label="Minimum Publication Year",
                    value=DEFAULT_MIN_YEAR,
                    precision=0
                )

                max_year_filter = gr.Number(
                    label="Maximum Publication Year",
                    value=DEFAULT_MAX_YEAR,
                    precision=0
                )

            # Column 3: methodological filters.
            with gr.Column():
                rct_filter = gr.Checkbox(
                    label="Only Randomized Controlled Trials",
                    value=False
                )

                min_sample_filter = gr.Number(
                    label="Minimum Sample Size",
                    value=None,
                    precision=0
                )

        filter_btn = gr.Button("πŸ” Apply Filters", variant="primary")

        filtered_results = gr.Markdown(label="Filtered Studies")

        filter_btn.click(
            callbacks.filter_studies,
            inputs=[country_filter, sector_filter, min_year_filter,
                    max_year_filter, rct_filter, min_sample_filter],
            outputs=filtered_results
        )
160
+
161
def create_about_tab():
    """Create the About tab (legacy copy; static Markdown only)."""
    with gr.Tab("ℹ️ About", id="about"):
        gr.Markdown("""
        ## About This Research Q&A System

        This system provides intelligent synthesis across thousands of research documents using:

        ### πŸ€– **AI-Powered Analysis**
        - **Semantic Search**: Uses Google's Gemini embeddings to find relevant studies
        - **Smart Synthesis**: Combines findings across multiple studies with proper citations
        - **Rich Metadata**: Leverages 35+ metadata fields per document

        ### πŸ“Š **Research Quality Metrics**
        - **Rigor Scores**: Methodological quality assessment
        - **Study Design**: RCTs, observational studies, mixed methods
        - **Validation Status**: Peer review and replication information

        ### 🌍 **Global Coverage**
        - Studies from fragile and conflict-affected states
        - Multiple World Bank sectors and regions
        - Comprehensive geographic and temporal coverage

        ### πŸ”¬ **Advanced Features**
        - Interactive visualizations and maps
        - Advanced filtering and search capabilities
        - Academic-style citations with full context

        ---

        **Data Sources**: Research documents from development economics, impact evaluation, and policy studies

        **Technology**: Built with Gradio, FAISS, Google Gemini AI, and Plotly
        """)
195
+
196
def create_pivot_tab(callbacks):
    """Create the Pivot Analysis tab (legacy copy).

    Dropdown choices come from callbacks.get_column_choices(), which is
    expected to return (categorical_cols, numeric_cols, all_cols).
    """
    with gr.Tab("πŸ“Š Pivot Analysis", id="pivot"):
        gr.Markdown("## Interactive Pivot Table Analysis")
        gr.Markdown("Create cross-tabulations and pivot tables to explore relationships in the data")

        with gr.Row():
            with gr.Column():
                # Get column choices
                categorical_cols, numeric_cols, all_cols = callbacks.get_column_choices()

                # Default to the first two categoricals when available.
                row_var = gr.Dropdown(
                    choices=categorical_cols,
                    label="Row Variable",
                    value=categorical_cols[0] if categorical_cols else None,
                    interactive=True
                )

                col_var = gr.Dropdown(
                    choices=categorical_cols,
                    label="Column Variable",
                    value=categorical_cols[1] if len(categorical_cols) > 1 else None,
                    interactive=True
                )

                # None means plain cross-tab counts instead of aggregation.
                value_var = gr.Dropdown(
                    choices=[None] + numeric_cols,
                    label="Value Variable (optional - for numeric aggregation)",
                    value=None,
                    interactive=True
                )

                agg_func = gr.Dropdown(
                    choices=["count", "mean", "sum"],
                    label="Aggregation Function",
                    value="count",
                    interactive=True
                )

                pivot_btn = gr.Button("πŸ”„ Create Pivot Analysis", variant="primary")

            with gr.Column():
                gr.Markdown("### πŸ’‘ Suggested Analyses")
                gr.Markdown("""
                **Popular combinations:**
                - Research Design Γ— World Bank Sector (count)
                - Countries Γ— Has Randomization (count)
                - Author Income Group Γ— Data Collection Method (count)
                - Research Design Γ— Rigor Score (mean)
                - World Bank Sector Γ— Sample Size (mean)

                **Tips:**
                - Use 'count' to see frequency distributions
                - Use 'mean' or 'sum' with numeric value variables
                - Choose variables with reasonable number of categories
                """)

        with gr.Row():
            pivot_plot = gr.Plot(label="Pivot Heatmap")

        with gr.Row():
            pivot_summary = gr.Markdown(label="Pivot Table Summary")

        pivot_btn.click(
            callbacks.create_pivot_analysis,
            inputs=[row_var, col_var, value_var, agg_func],
            outputs=[pivot_plot, pivot_summary]
        )
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.44.1
2
+ pandas>=1.5.0
3
+ numpy>=1.21.0
4
+ plotly>=5.15.0
5
+ folium>=0.14.0
6
+ faiss-cpu>=1.7.4
7
+ google-generativeai>=0.3.0
8
+ scikit-learn>=1.3.0
9
+ tabulate>=0.9.0
research_chunks.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82a1b861e56617f09458ac02fc58e5d70927f888b34af515043c1e047cfa644b
3
+ size 65310765
synthesis_qa_backend.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import faiss
3
+ import pandas as pd
4
+ import numpy as np
5
+ import google.generativeai as genai
6
+ from typing import List, Dict, Optional, Tuple
7
+ from collections import defaultdict
8
+ import logging
9
+ import time
10
+ from dataclasses import dataclass
11
+
12
+
13
@dataclass
class SynthesisConfig:
    """Configuration class for research synthesis parameters."""
    top_k: int = 20                           # candidates retrieved per query
    min_relevance_strict: float = 0.7         # high-confidence inclusion cut-off
    min_relevance_moderate: float = 0.6       # moderate-confidence cut-off
    min_relevance_threshold: float = 0.55     # absolute floor for inclusion
    max_studies: int = 6
    min_studies: int = 4
    max_synthesis_tokens: int = 4000
    rate_limit_delay: float = 1.0             # seconds between API calls
    # None is a sentinel replaced in __post_init__; a list default here
    # would be a shared mutable default across instances.
    domain_keywords: Optional[List[str]] = None

    def __post_init__(self):
        # Install the default keyword list only when the caller did not
        # provide one.
        if self.domain_keywords is None:
            self.domain_keywords = [
                'development', 'health', 'education', 'governance', 'poverty',
                'conflict', 'fragile', 'intervention', 'policy', 'evaluation',
                'impact', 'program', 'research', 'study', 'analysis', 'survey'
            ]
33
+
34
+ class QueryAnalyzer:
35
+ """Analyzes queries to determine relevance to research domain"""
36
+
37
def __init__(self, config: SynthesisConfig):
    # Keep the shared synthesis configuration (thresholds, keyword lists).
    self.config = config
39
+
40
def is_domain_relevant(self, query: str) -> Tuple[bool, float, str]:
    """
    Check if query is relevant to research domain.

    Scoring: keyword hits from config.domain_keywords count 1 each,
    research-question phrasings count 2 each, methodological terms count
    1 each; hard-blocked topics short-circuit to False first.

    Returns: (is_relevant, confidence_score, reason)
    """
    query_lower = query.lower()

    # Check for obvious non-research queries — any hit rejects immediately.
    non_research_patterns = [
        'who won', 'world cup', 'sports', 'entertainment', 'celebrity',
        'weather', 'stock price', 'cryptocurrency', 'movie', 'music',
        'recipe', 'cooking', 'fashion', 'shopping', 'games', 'gaming'
    ]

    for pattern in non_research_patterns:
        if pattern in query_lower:
            return False, 0.1, f"Query contains non-research pattern: '{pattern}'"

    # Check for domain relevance - multiple approaches
    domain_matches = sum(1 for keyword in self.config.domain_keywords
                         if keyword in query_lower)

    # Research question patterns (even without domain keywords)
    research_patterns = [
        'what methods', 'what approaches', 'how do', 'how to',
        'what strategies', 'what techniques', 'how can',
        'what are the', 'which methods', 'which approaches'
    ]

    research_pattern_matches = sum(1 for pattern in research_patterns
                                   if pattern in query_lower)

    # Methodological terms that indicate research focus
    method_terms = [
        'method', 'approach', 'strategy', 'technique', 'measure',
        'measurement', 'data', 'sample', 'study', 'research',
        'analysis', 'evaluation', 'assessment', 'design'
    ]

    method_matches = sum(1 for term in method_terms if term in query_lower)

    # Calculate total relevance score (research patterns weighted double).
    total_score = domain_matches + (research_pattern_matches * 2) + method_matches

    if total_score == 0:
        return False, 0.3, "No domain-relevant keywords or research patterns found"

    # Be more generous for methodological queries: confidence starts at
    # 0.6 and grows with the score, capped at 0.9.
    if research_pattern_matches > 0 or method_matches >= 2:
        confidence = min(0.9, 0.6 + (total_score * 0.05))
        return True, confidence, f"Found research patterns and methodological terms (score: {total_score})"

    # Keyword-only match: lower base confidence (0.5), same 0.9 cap.
    if domain_matches > 0:
        confidence = min(0.9, 0.5 + (domain_matches * 0.1))
        return True, confidence, f"Found {domain_matches} domain-relevant keywords"

    return False, 0.3, "Insufficient domain relevance"
97
+
98
+ def analyze_query_type(self, query: str) -> Dict[str, str]:
99
+ """Analyze query to determine focus area and type"""
100
+ query_lower = query.lower()
101
+
102
+ focus_area = "general findings"
103
+ query_type = "exploratory"
104
+
105
+ # Determine focus area
106
+ if any(word in query_lower for word in ['method', 'approach', 'methodology', 'technique', 'design']):
107
+ focus_area = "methodological approaches"
108
+ query_type = "methodological"
109
+ elif any(word in query_lower for word in ['result', 'finding', 'outcome', 'impact', 'effect', 'evaluation']):
110
+ focus_area = "key findings and outcomes"
111
+ query_type = "results-focused"
112
+ elif any(word in query_lower for word in ['challenge', 'barrier', 'problem', 'issue', 'difficulty']):
113
+ focus_area = "challenges and barriers"
114
+ query_type = "problem-identification"
115
+ elif any(word in query_lower for word in ['recommendation', 'solution', 'strategy', 'intervention', 'policy']):
116
+ focus_area = "strategies and recommendations"
117
+ query_type = "solution-oriented"
118
+ elif any(word in query_lower for word in ['what', 'how', 'why', 'which', 'where']):
119
+ query_type = "analytical"
120
+
121
+ # Additional FCAS-specific analysis
122
+ if any(word in query_lower for word in ['sampling', 'sample', 'recruitment', 'selection']):
123
+ focus_area = "sampling and recruitment strategies"
124
+ query_type = "methodological"
125
+ elif any(word in query_lower for word in ['data quality', 'validation', 'reliability', 'validity']):
126
+ focus_area = "data quality and validation"
127
+ query_type = "methodological"
128
+ elif any(word in query_lower for word in ['ethical', 'ethics', 'consent', 'protection']):
129
+ focus_area = "ethical considerations"
130
+ query_type = "methodological"
131
+ elif any(word in query_lower for word in ['tracking', 'mobile', 'displacement', 'attrition']):
132
+ focus_area = "population tracking and attrition"
133
+ query_type = "methodological"
134
+ elif any(word in query_lower for word in ['proxy', 'indicator', 'measurement', 'counterfactual']):
135
+ focus_area = "measurement and identification strategies"
136
+ query_type = "methodological"
137
+
138
+ return {
139
+ 'focus_area': focus_area,
140
+ 'query_type': query_type,
141
+ 'original_query': query
142
+ }
143
+
144
class ResearchSynthesizer:
    """Retrieval-augmented QA over a FAISS index of research-paper chunks.

    Pipeline: embed the user query with the Gemini embeddings API, retrieve
    nearest chunks from the FAISS index, group chunks by study, filter and
    rank studies with adaptive relevance thresholds, then ask a Gemini
    generative model to synthesize an answer with formatted references.
    """

    def __init__(self, index_path: str, metadata_path: str, api_key: str,
                 config: Optional[SynthesisConfig] = None,
                 log_level: int = logging.INFO):
        """Initialize the research synthesis system.

        Args:
            index_path: Path to the FAISS index file.
            metadata_path: Path to the per-chunk metadata CSV. Must contain a
                'record_id' column; other columns are treated as study metadata.
            api_key: Google Generative AI API key.
            config: Optional SynthesisConfig; defaults are used when omitted.
            log_level: Logging level passed to logging.basicConfig.

        Raises:
            ValueError: From _validate_inputs for bad paths/key/thresholds.
            Exception: Re-raised when loading the index/metadata or
                configuring the API fails.
        """

        # Setup logging.
        # NOTE(review): basicConfig mutates process-global logging state;
        # fine for a script, surprising when used as a library.
        logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

        # Configuration
        self.config = config or SynthesisConfig()
        self.query_analyzer = QueryAnalyzer(self.config)

        # Validate inputs before any expensive loading.
        self._validate_inputs(index_path, metadata_path, api_key)

        try:
            # Load FAISS index and metadata
            self.index = faiss.read_index(index_path)
            self.metadata = pd.read_csv(metadata_path)

            # Configure Gemini API
            genai.configure(api_key=api_key)

            self.logger.info(f"Loaded {self.index.ntotal} chunks from {len(self.metadata['record_id'].unique())} documents")
            self.logger.info(f"FAISS index dimensions: {self.index.d}")

            # Check embedding-vs-index dimension compatibility; sets
            # self.dimension_mismatch (and target/source dims when mismatched).
            self._check_dimensions()

        except Exception as e:
            self.logger.error(f"Failed to initialize synthesizer: {e}")
            raise

    def _validate_inputs(self, index_path: str, metadata_path: str, api_key: str):
        """Validate constructor parameters; raises ValueError on bad input."""
        if not index_path or not metadata_path:
            raise ValueError("Index path and metadata path must be provided")

        # Rejects the placeholder key used by the demo main().
        if not api_key or api_key == "your_api_key_here":
            raise ValueError("Valid API key must be provided")

        # Threshold ordering sanity check for the adaptive filtering logic.
        if self.config.min_relevance_strict < self.config.min_relevance_moderate:
            raise ValueError("Strict relevance threshold must be >= moderate threshold")

    def _check_dimensions(self):
        """Probe the embeddings API and record any dimension mismatch.

        Sets self.dimension_mismatch, and when mismatched also
        self.target_dim (FAISS) and self.source_dim (Gemini), so that
        _adjust_embedding_dimensions can pad/truncate query embeddings.
        """
        test_embedding = self._create_test_embedding()
        if test_embedding is not None:
            embedding_dim = test_embedding.shape[1]
            index_dim = self.index.d

            self.logger.info(f"Gemini embedding dimensions: {embedding_dim}")

            if embedding_dim != index_dim:
                self.logger.warning(f"DIMENSION MISMATCH: Gemini={embedding_dim}, FAISS={index_dim}")
                self.logger.info("Will apply dimension adjustment during search")
                self.dimension_mismatch = True
                self.target_dim = index_dim
                self.source_dim = embedding_dim
            else:
                self.logger.info("Dimensions match perfectly")
                self.dimension_mismatch = False
        else:
            # Probe failed (e.g. API error): proceed optimistically without
            # adjustment rather than refusing to start.
            self.logger.warning("Could not test embedding dimensions")
            self.dimension_mismatch = False

    def _create_test_embedding(self) -> Optional[np.ndarray]:
        """Embed the string "test" to learn the API's output dimensionality.

        Returns a (1, dim) float32 array, or None if the API call fails.
        """
        try:
            time.sleep(self.config.rate_limit_delay)  # Rate limiting
            embed_result = genai.embed_content(
                model="models/gemini-embedding-001",
                content="test",
                task_type="retrieval_query"
            )
            return np.array([embed_result['embedding']], dtype="float32")
        except Exception as e:
            self.logger.error(f"Could not create test embedding: {e}")
            return None

    def _adjust_embedding_dimensions(self, embedding: np.ndarray) -> np.ndarray:
        """Pad (zeros) or truncate a (1, dim) embedding to the index dimension.

        No-op when no mismatch was detected. Both pad and truncate are lossy
        approximations of a proper re-projection.
        """
        if not self.dimension_mismatch:
            return embedding

        current_dim = embedding.shape[1]
        target_dim = self.target_dim

        self.logger.debug(f"Adjusting dimensions: {current_dim} β†’ {target_dim}")

        if current_dim < target_dim:
            # Pad with zeros
            padding = np.zeros((embedding.shape[0], target_dim - current_dim), dtype="float32")
            adjusted = np.concatenate([embedding, padding], axis=1)
        elif current_dim > target_dim:
            # Truncate (consider PCA for better semantic preservation)
            adjusted = embedding[:, :target_dim]
        else:
            adjusted = embedding

        return adjusted

    def search_relevant_chunks(self, query: str) -> List[Dict]:
        """Embed *query* and retrieve the top_k nearest chunks from FAISS.

        Returns a list of metadata dicts (one per chunk) augmented with
        'similarity_score', 'faiss_distance' and 'faiss_index', sorted by
        similarity descending. Returns [] on embedding or search failure.
        """
        self.logger.info(f"Searching for: '{query}'")

        try:
            time.sleep(self.config.rate_limit_delay)  # Rate limiting
            embed_result = genai.embed_content(
                model="models/gemini-embedding-001",
                content=query,
                task_type="retrieval_query"
            )
            query_embedding = np.array([embed_result['embedding']], dtype="float32")
            self.logger.debug(f"Embedding created: shape {query_embedding.shape}")

        except Exception as e:
            self.logger.error(f"Embedding creation failed: {e}")
            return []

        # Adjust dimensions if needed
        query_embedding = self._adjust_embedding_dimensions(query_embedding)

        try:
            distances, indices = self.index.search(query_embedding, self.config.top_k)
            self.logger.info(f"Search completed - found {len(indices[0])} results")
            self.logger.debug(f"Distance range: {distances[0].min():.4f} to {distances[0].max():.4f}")
        except Exception as e:
            self.logger.error(f"FAISS search failed: {e}")
            return []

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            # -1 is FAISS's "no result" sentinel; also guard against rows
            # missing from the metadata CSV.
            if idx == -1 or idx >= len(self.metadata):
                continue

            try:
                chunk_data = self.metadata.iloc[idx].to_dict()
                # Distance -> similarity via 1/(1+d); assumes smaller distance
                # means closer (e.g. an L2 index) — confirm the index metric.
                chunk_data['similarity_score'] = float(1 / (1 + distance))
                chunk_data['faiss_distance'] = float(distance)
                chunk_data['faiss_index'] = int(idx)
                results.append(chunk_data)
            except (IndexError, KeyError) as e:
                self.logger.warning(f"Invalid index {idx}, skipping: {e}")
                continue

        # Sort by similarity score
        results.sort(key=lambda x: x['similarity_score'], reverse=True)

        if results:
            best_score = results[0]['similarity_score']
            worst_score = results[-1]['similarity_score']
            self.logger.info(f"Similarity range: {worst_score:.4f} to {best_score:.4f}")

        return results

    def group_by_studies(self, chunks: List[Dict]) -> Dict[str, List[Dict]]:
        """Group retrieved chunks by their 'record_id' (one entry per study)."""
        studies = defaultdict(list)
        for chunk in chunks:
            studies[chunk['record_id']].append(chunk)
        return dict(studies)

    def filter_and_rank_studies(self, studies: Dict[str, List[Dict]],
                                query: str = "") -> Tuple[List[Dict], str]:
        """
        Select the most relevant studies using adaptive thresholds.

        Picks the strictest of the three configured thresholds that the best
        study still clears, filters studies by it, builds one summary dict per
        surviving study, and ranks by similarity plus metadata/quality boosts.

        Returns: (selected_studies, quality_message)
        """
        study_summaries = []

        # Determine threshold based on best available scores
        all_best_scores = []
        for record_id, chunks in studies.items():
            best_chunk = max(chunks, key=lambda x: x['similarity_score'])
            all_best_scores.append(best_chunk['similarity_score'])

        if not all_best_scores:
            return [], "No studies found"

        max_score = max(all_best_scores)
        # NOTE(review): avg_score is computed but never used below.
        avg_score = np.mean(all_best_scores)

        # Adaptive threshold selection: strictest tier the best study reaches.
        if max_score >= self.config.min_relevance_strict:
            threshold = self.config.min_relevance_strict
            quality = "high"
        elif max_score >= self.config.min_relevance_moderate:
            threshold = self.config.min_relevance_moderate
            quality = "moderate"
        elif max_score >= self.config.min_relevance_threshold:
            threshold = self.config.min_relevance_threshold
            quality = "low"
        else:
            return [], f"No studies met minimum relevance threshold. Best score: {max_score:.3f}"

        self.logger.info(f"Using {quality} quality threshold: {threshold:.3f}")

        # Filter studies
        for record_id, chunks in studies.items():
            best_chunk = max(chunks, key=lambda x: x['similarity_score'])

            if best_chunk['similarity_score'] < threshold:
                continue

            # Get relevant chunks with slightly lower threshold (80% of tier).
            relevant_chunks = [c for c in chunks
                               if c['similarity_score'] > threshold * 0.8]

            # Limit text to prevent token overflow: at most 3 chunks...
            combined_texts = [c['text'] for c in relevant_chunks[:3]]
            combined_text = "\n\n".join(combined_texts)

            # ...and at most 1500 characters of combined text.
            if len(combined_text) > 1500:
                combined_text = combined_text[:1500] + "..."

            study_summary = {
                'record_id': record_id,
                'combined_text': combined_text,
                'max_relevance': best_chunk['similarity_score'],
                'chunk_count': len(relevant_chunks)
            }

            # Copy metadata from the best chunk (excluding internal fields).
            excluded_fields = {
                'record_id', 'full_text', 'text', 'chunk_id', 'section',
                'chunk_type', 'word_count', 'faiss_distance', 'faiss_index'
            }

            for key, value in best_chunk.items():
                if key not in excluded_fields and not key.startswith('similarity'):
                    study_summary[key] = value

            study_summaries.append(study_summary)

        # Ranking: base similarity plus capped metadata/quality boosts.
        def enhanced_score(study):
            base_score = study['max_relevance']

            # Metadata relevance boost (query terms appearing in study metadata)
            metadata_boost = self._calculate_metadata_boost(study, query)

            # Quality indicators boost (rigor/randomization flags)
            quality_boost = self._calculate_quality_boost(study)

            return base_score + metadata_boost + quality_boost

        study_summaries.sort(key=enhanced_score, reverse=True)
        selected_studies = study_summaries[:self.config.max_studies]

        quality_message = f"Selected {len(selected_studies)} studies with {quality} relevance (threshold: {threshold:.3f})"

        self.logger.info(quality_message)
        for i, study in enumerate(selected_studies, 1):
            title = study.get('title', 'No title')[:50]
            # NOTE(review): recomputes enhanced_score per study for logging.
            score = enhanced_score(study)
            self.logger.debug(f"  {i}. Score: {score:.4f} - {title}...")

        return selected_studies, quality_message

    def _calculate_metadata_boost(self, study: Dict, query: str) -> float:
        """Boost for query words found in a study's metadata fields.

        Each word-in-field match adds 0.05; total is capped at 0.2.
        """
        query_lower = query.lower()
        metadata_boost = 0

        boost_fields = [
            'world_bank_sector', 'world_bank_subsector', 'study_countries',
            'population', 'data_collection_method', 'analysis_type',
            'research_design', 'topic_summary', 'countries_list'
        ]

        for field in boost_fields:
            if field in study and study[field]:
                field_value = str(study[field]).lower()
                matches = sum(1 for word in query_lower.split() if word in field_value)
                metadata_boost += matches * 0.05  # Smaller, more controlled boost

        return min(metadata_boost, 0.2)  # Cap the boost

    def _calculate_quality_boost(self, study: Dict) -> float:
        """Boost derived from study-quality indicators in the metadata.

        Boolean flags add fixed increments; rigor_score adds up to 0.1 more.
        """
        quality_boost = 0

        # Boolean quality indicators.
        # NOTE(review): compares against the string 'true' — presumably the
        # CSV stores lowercase string booleans; confirm against the data.
        bool_indicators = {
            'has_randomization': 0.08,
            'has_validation': 0.05,
            'has_advanced_analysis': 0.03,
            'has_mixed_methods': 0.03
        }

        for field, boost in bool_indicators.items():
            if study.get(field) == 'true':
                quality_boost += boost

        # Numeric quality indicators; non-numeric values are silently ignored.
        try:
            rigor_score = float(study.get('rigor_score', 0))
            quality_boost += min(rigor_score * 0.02, 0.1)  # Cap at 0.1
        except (ValueError, TypeError):
            pass

        return quality_boost

    def create_synthesis(self, query: str, studies: List[Dict],
                         query_analysis: Dict) -> str:
        """Generate the synthesized answer text via the Gemini model.

        Builds a per-study context block, scales the requested answer length
        with the study count, and prompts the model. Returns the model text,
        or an error string if generation fails.
        """

        # Build concise context
        studies_context = self._build_studies_context(studies)

        # Determine synthesis length based on study count
        if len(studies) <= 3:
            synthesis_style = "concise"
            max_length = "2-3 paragraphs"
        elif len(studies) <= 6:
            synthesis_style = "balanced"
            max_length = "3-4 paragraphs with clear sections"
        else:
            synthesis_style = "comprehensive"
            max_length = "4-5 paragraphs with detailed analysis"

        synthesis_prompt = f"""You are an expert research synthesizer analyzing studies from fragile and conflict-affected settings (FCAS).

USER QUERY: "{query}"
QUERY TYPE: {query_analysis['query_type']}
FOCUS AREA: {query_analysis['focus_area']}

STUDIES TO SYNTHESIZE ({len(studies)} studies):
{studies_context}

SYNTHESIS INSTRUCTIONS:
1. **Direct Answer First**: Start with a clear, direct answer to the user's question
2. **Evidence-Based**: Ground all claims in the provided studies with citations (Author, Year)
3. **{synthesis_style.title()} Analysis**: Write {max_length}
4. **Key Focus**: Emphasize {query_analysis['focus_area']}
5. **Geographic Context**: Note relevant country/regional patterns
6. **Methodology**: Briefly mention study designs and sample sizes when relevant

FORMAT: Use clear prose without bullet points. Include specific citations and key statistics.
LENGTH: {max_length} maximum.

Write a focused synthesis that directly addresses: "{query}" """

        try:
            time.sleep(self.config.rate_limit_delay)  # Rate limiting
            model = genai.GenerativeModel("gemini-1.5-flash")
            response = model.generate_content(synthesis_prompt)
            return response.text
        except Exception as e:
            self.logger.error(f"Synthesis generation failed: {e}")
            # Errors are surfaced in-band as the answer text rather than raised.
            return f"Error creating synthesis: {e}"

    def _build_studies_context(self, studies: List[Dict]) -> str:
        """Render the selected studies into the prompt's context section.

        Each entry: truncated title/authors/countries, a compact method line,
        and up to 800 characters of the study's combined chunk text.
        """
        studies_context = ""

        for i, study in enumerate(studies, 1):
            # Essential metadata, truncated to keep the prompt small.
            # NOTE(review): slicing assumes these values are strings; a
            # non-string (e.g. pandas NaN) would raise — confirm CSV dtypes.
            title = study.get('title', 'Unknown Title')[:80]
            authors = study.get('authors', 'Unknown Authors')[:50]
            year = study.get('publication_year', study.get('research_year', 'Unknown'))
            countries = study.get('study_countries', study.get('countries_list', 'Unknown'))[:50]

            studies_context += f"\n[{i}] {title}\n"
            studies_context += f"Authors: {authors} ({year}) | Countries: {countries}\n"

            # Key methodology info
            method_info = []
            for field, label in [
                ('research_design', 'Design'),
                ('sample_size', 'N'),
                ('rigor_score', 'Rigor')
            ]:
                if field in study and study[field]:
                    value = str(study[field])
                    if value.lower() not in ['unknown', 'nan', '']:
                        method_info.append(f"{label}: {value}")

            if method_info:
                studies_context += f"Method: {' | '.join(method_info)}\n"

            # Truncated content
            content = study['combined_text'][:800]
            studies_context += f"Content: {content}...\n"
            studies_context += "-" * 60 + "\n"

        return studies_context

    def format_references(self, studies: List[Dict]) -> str:
        """Format academic-style numbered references for the selected studies."""
        references = []

        for i, study in enumerate(studies, 1):
            title = study.get('title', 'Unknown Title')
            authors = study.get('authors', 'Unknown Authors')
            year = study.get('publication_year', study.get('research_year', 'Unknown'))
            countries = study.get('study_countries', '')

            ref = f"[{i}] {authors} ({year}). {title}"

            if countries:
                ref += f" *Countries: {countries}*"

            if study.get('max_relevance'):
                ref += f" *Relevance: {study['max_relevance']:.3f}*"

            references.append(ref)

        return "\n\n".join(references)

    def answer_research_question(self, query: str) -> Dict[str, object]:
        """Answer a research question end-to-end.

        Validates and scope-checks the query, retrieves and filters studies,
        then synthesizes an answer. Always returns a dict with keys 'answer',
        'references', 'study_count', 'quality' and 'suggestions' (plus
        'quality_message' and 'query_analysis' on the success path); 'quality'
        is one of: invalid, out_of_scope, no_results, insufficient, high,
        moderate, low.
        """
        self.logger.info(f"Processing query: '{query}'")

        # Validate query length
        if len(query.strip()) < 3:
            return {
                'answer': "Query too short. Please provide a more detailed research question.",
                'references': "",
                'study_count': 0,
                'quality': "invalid",
                'suggestions': []
            }

        # Check domain relevance
        is_relevant, confidence, reason = self.query_analyzer.is_domain_relevant(query)

        if not is_relevant:
            # Offer example in-scope questions instead of an answer.
            suggestions = [
                "What sampling strategies work best in conflict-affected areas?",
                "How do researchers ensure data quality during active conflict?",
                "What are the ethical considerations for RCTs in fragile states?",
                "How do studies handle attrition bias in longitudinal FCAS research?",
                "What proxy measures are used when direct measurement is impossible?",
                "How do researchers adapt survey instruments for low-literacy populations?",
                "What methods are used to track mobile populations in conflict zones?",
                "How do studies establish counterfactuals in fragile settings?"
            ]

            return {
                'answer': f"This query appears to be outside the scope of development research in fragile and conflict-affected settings.\n\nReason: {reason}\n\nThis database contains research on development, health, education, governance, and policy interventions in FCAS contexts.",
                'references': "",
                'study_count': 0,
                'quality': "out_of_scope",
                'suggestions': suggestions
            }

        # Analyze query type
        query_analysis = self.query_analyzer.analyze_query_type(query)

        # Search for relevant chunks
        relevant_chunks = self.search_relevant_chunks(query)

        if not relevant_chunks:
            return {
                'answer': "No relevant studies found. This might be due to technical issues or very specific query terms.",
                'references': "",
                'study_count': 0,
                'quality': "no_results",
                'suggestions': ["Try broader search terms", "Check spelling", "Use more general concepts"]
            }

        # Group by studies
        studies_dict = self.group_by_studies(relevant_chunks)
        self.logger.info(f"Found {len(studies_dict)} unique studies")

        # Filter and rank studies
        top_studies, quality_message = self.filter_and_rank_studies(studies_dict, query)

        if len(top_studies) < self.config.min_studies:
            return {
                'answer': f"Found {len(studies_dict)} studies but only {len(top_studies)} met relevance criteria.\n\n{quality_message}\n\nTry using broader search terms or different keywords.",
                'references': "",
                'study_count': len(studies_dict),
                'quality': "insufficient",
                'suggestions': ["Use broader terms", "Try synonyms", "Focus on general concepts"]
            }

        # Create synthesis
        self.logger.info(f"Synthesizing findings from {len(top_studies)} studies")
        synthesis = self.create_synthesis(query, top_studies, query_analysis)
        references = self.format_references(top_studies)

        # Determine overall quality from the mean relevance of selected studies.
        avg_relevance = np.mean([s['max_relevance'] for s in top_studies])
        if avg_relevance >= self.config.min_relevance_strict:
            quality = "high"
        elif avg_relevance >= self.config.min_relevance_moderate:
            quality = "moderate"
        else:
            quality = "low"

        return {
            'answer': synthesis,
            'references': references,
            'study_count': len(top_studies),
            'quality': quality,
            'quality_message': quality_message,
            'query_analysis': query_analysis,
            'suggestions': []
        }
651
+
652
def main():
    """Smoke-test the synthesis pipeline against a handful of queries."""
    import os

    # Looser thresholds and a wider candidate pool than the defaults,
    # suitable for interactive experimentation.
    demo_config = SynthesisConfig(
        top_k=25,
        min_relevance_strict=0.65,
        min_relevance_moderate=0.55,
        min_relevance_threshold=0.50,
        max_studies=8,
        min_studies=3
    )

    api_key = os.environ.get("GOOGLE_API_KEY", "your_api_key_here")
    separator = "=" * 80

    try:
        synthesizer = ResearchSynthesizer(
            index_path="research_chunks.faiss",
            metadata_path="chunk_metadata.csv",
            api_key=api_key,
            config=demo_config,
            log_level=logging.INFO
        )

        sample_queries = [
            "what sampling strategies work best in conflict zones?",
            "how do researchers ensure data quality during active conflict?",
            "what are ethical considerations for randomized trials in fragile states?",
            "how do studies handle attrition bias in FCAS research?",
            "what proxy measures are used when direct measurement is impossible?",
            "how do researchers adapt survey instruments for low-literacy populations?",
            "who won the world cup in 2022?",  # expected to be rejected as out of scope
        ]

        for question in sample_queries:
            print(f"\n{separator}")
            print(f"QUERY: {question}")
            print(separator)

            outcome = synthesizer.answer_research_question(question)

            print(f"Quality: {outcome['quality']}")
            print(f"Studies: {outcome['study_count']}")
            print("\nAnswer:")
            print(outcome['answer'])

            if outcome['references']:
                print("\nReferences:")
                print(outcome['references'])

            if outcome['suggestions']:
                print("\nSuggestions:")
                for tip in outcome['suggestions']:
                    print(f"  β€’ {tip}")

    except Exception as e:
        logging.error(f"Failed to run main: {e}")
        raise


if __name__ == "__main__":
    main()
synthesis_qa_backend_old.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import pandas as pd
3
+ import numpy as np
4
+ import google.generativeai as genai
5
+ from typing import List, Dict
6
+ from collections import defaultdict
7
+
8
+
9
+ class ResearchSynthesizer:
10
+ def __init__(self, index_path: str, metadata_path: str, api_key: str):
11
+ """Initialize the research synthesis system"""
12
+ self.index = faiss.read_index(index_path)
13
+ self.metadata = pd.read_csv(metadata_path)
14
+ genai.configure(api_key=api_key)
15
+
16
+ print(f"πŸ” Loaded {self.index.ntotal} chunks from {len(self.metadata['record_id'].unique())} documents")
17
+ print(f"πŸ“Š FAISS index dimensions: {self.index.d}")
18
+
19
+ # Check for dimension mismatch
20
+ test_embedding = self._create_test_embedding()
21
+ if test_embedding is not None:
22
+ embedding_dim = test_embedding.shape[1]
23
+ index_dim = self.index.d
24
+ print(f"πŸ“Š Gemini embedding dimensions: {embedding_dim}")
25
+
26
+ if embedding_dim != index_dim:
27
+ print(f"⚠️ DIMENSION MISMATCH: Gemini={embedding_dim}, FAISS={index_dim}")
28
+ print("πŸ”§ Will apply dimension adjustment during search")
29
+ self.dimension_mismatch = True
30
+ self.target_dim = index_dim
31
+ self.source_dim = embedding_dim
32
+ else:
33
+ print("βœ… Dimensions match perfectly")
34
+ self.dimension_mismatch = False
35
+ else:
36
+ print("⚠️ Could not test embedding dimensions")
37
+ self.dimension_mismatch = False
38
+
39
+ def _create_test_embedding(self):
40
+ """Create a test embedding to check dimensions"""
41
+ try:
42
+ embed_result = genai.embed_content(
43
+ model="models/gemini-embedding-001", # Try the newer model first
44
+ content="test",
45
+ task_type="retrieval_query"
46
+ )
47
+ return np.array([embed_result['embedding']], dtype="float32")
48
+ except:
49
+ try:
50
+ # Fallback to older model
51
+ embed_result = genai.embed_content(
52
+ model="models/gemini-embedding-001",
53
+ content="test",
54
+ task_type="retrieval_query"
55
+ )
56
+ return np.array([embed_result['embedding']], dtype="float32")
57
+ except Exception as e:
58
+ print(f"❌ Could not create test embedding: {e}")
59
+ return None
60
+
61
+ def _adjust_embedding_dimensions(self, embedding: np.ndarray) -> np.ndarray:
62
+ """Adjust embedding dimensions to match FAISS index"""
63
+ if not self.dimension_mismatch:
64
+ return embedding
65
+
66
+ current_dim = embedding.shape[1]
67
+ target_dim = self.target_dim
68
+
69
+ print(f"πŸ”§ Adjusting dimensions: {current_dim} β†’ {target_dim}")
70
+
71
+ if current_dim < target_dim:
72
+ # Pad with zeros
73
+ padding = np.zeros((embedding.shape[0], target_dim - current_dim), dtype="float32")
74
+ adjusted = np.concatenate([embedding, padding], axis=1)
75
+ print(f"βœ… Padded embedding to {adjusted.shape[1]} dimensions")
76
+ elif current_dim > target_dim:
77
+ # Truncate
78
+ adjusted = embedding[:, :target_dim]
79
+ print(f"βœ… Truncated embedding to {adjusted.shape[1]} dimensions")
80
+ else:
81
+ adjusted = embedding
82
+
83
+ return adjusted
84
+
85
+ def search_relevant_chunks(self, query: str, top_k: int = 15) -> List[Dict]:
86
+ """Find relevant chunks using FAISS index and Gemini embeddings API"""
87
+ print(f"πŸ” Searching for: '{query}'")
88
+
89
+ # Try newer embedding model first, then fallback
90
+ embedding_models = [
91
+ "models/gemini-embedding-001"
92
+ ]
93
+
94
+ query_embedding = None
95
+ for model in embedding_models:
96
+ try:
97
+ print(f"🧠 Trying embedding model: {model}")
98
+ embed_result = genai.embed_content(
99
+ model=model,
100
+ content=query,
101
+ task_type="retrieval_query"
102
+ )
103
+ query_embedding = np.array([embed_result['embedding']], dtype="float32")
104
+ print(f"βœ… Embedding created with {model}: shape {query_embedding.shape}")
105
+ break
106
+ except Exception as e:
107
+ print(f"❌ Failed with {model}: {e}")
108
+ continue
109
+
110
+ if query_embedding is None:
111
+ print("❌ All embedding models failed")
112
+ return []
113
+
114
+ # Adjust dimensions if needed
115
+ query_embedding = self._adjust_embedding_dimensions(query_embedding)
116
+ print(f"πŸ”§ Final embedding shape: {query_embedding.shape}")
117
+
118
+ try:
119
+ distances, indices = self.index.search(query_embedding, top_k)
120
+ print(f"πŸ“Š Search completed - found {len(indices[0])} results")
121
+ print(f"πŸ“Š Distances range: {distances[0].min():.4f} to {distances[0].max():.4f}")
122
+ except Exception as e:
123
+ print(f"❌ FAISS search failed: {e}")
124
+ return []
125
+
126
+ results = []
127
+ for distance, idx in zip(distances[0], indices[0]):
128
+ if idx == -1:
129
+ continue
130
+ try:
131
+ chunk_data = self.metadata.iloc[idx].to_dict()
132
+ chunk_data['similarity_score'] = float(1 / (1 + distance))
133
+ chunk_data['faiss_distance'] = float(distance)
134
+ chunk_data['faiss_index'] = int(idx)
135
+ results.append(chunk_data)
136
+ except IndexError:
137
+ print(f"⚠️ Invalid index {idx}, skipping")
138
+ continue
139
+
140
+ print(f"βœ… Retrieved {len(results)} valid chunks")
141
+
142
+ # Sort by similarity score
143
+ results.sort(key=lambda x: x['similarity_score'], reverse=True)
144
+
145
+ if results:
146
+ best_score = results[0]['similarity_score']
147
+ worst_score = results[-1]['similarity_score']
148
+ print(f"πŸ“Š Similarity range: {worst_score:.4f} to {best_score:.4f}")
149
+
150
+ return results
151
+
152
+ def group_by_studies(self, chunks: List[Dict]) -> Dict[str, List[Dict]]:
153
+ """Group chunks by study/document"""
154
+ studies = defaultdict(list)
155
+ for chunk in chunks:
156
+ studies[chunk['record_id']].append(chunk)
157
+ return dict(studies)
158
+
159
def filter_and_rank_studies(self, studies: Dict[str, List[Dict]],
                            query: str = "",
                            min_relevance: float = 0.6,
                            max_studies: int = 8) -> List[Dict]:
    """Select and rank the most relevant studies using chunk scores plus metadata.

    Each study is represented by its best-scoring chunk; studies whose best
    chunk falls below ``min_relevance`` are dropped. Survivors are re-scored
    with metadata/quality boosts and the top ``max_studies`` are returned.

    Args:
        studies: Mapping of record_id -> list of chunk dicts (each with a
            'similarity_score' and a 'text' field).
        query: Original user query; used for keyword-overlap metadata boosts.
        min_relevance: Minimum best-chunk similarity for a study to survive.
        max_studies: Cap on the number of studies returned.

    Returns:
        Study-summary dicts sorted by the boosted score, highest first.
    """
    study_summaries = []

    print(f"πŸ” Filtering {len(studies)} studies with min_relevance={min_relevance}")

    for record_id, chunks in studies.items():
        # A study is judged by its single best chunk.
        best_chunk = max(chunks, key=lambda x: x['similarity_score'])

        print(f"πŸ“„ Study {record_id}: best score = {best_chunk['similarity_score']:.4f}")

        if best_chunk['similarity_score'] < min_relevance:
            print(f"❌ Skipping study {record_id} - below threshold")
            continue

        # Additional chunks are admitted with a more lenient (70%) threshold;
        # only the first three contribute to the combined synthesis text.
        relevant_chunks = [c for c in chunks if c['similarity_score'] > min_relevance * 0.7]
        combined_text = "\n\n".join([c['text'] for c in relevant_chunks[:3]])

        study_summary = {
            'record_id': record_id,
            'combined_text': combined_text,
            'max_relevance': best_chunk['similarity_score'],
            'chunk_count': len(relevant_chunks)
        }

        # Copy study-level metadata from the best chunk, excluding
        # chunk-internal/bookkeeping fields and similarity_* scores.
        excluded_fields = {'record_id', 'full_text', 'text', 'chunk_id', 'section',
                           'chunk_type', 'word_count', 'faiss_distance', 'faiss_index'}
        for key, value in best_chunk.items():
            if key not in excluded_fields and not key.startswith('similarity'):
                study_summary[key] = value

        study_summaries.append(study_summary)

    print(f"βœ… Found {len(study_summaries)} studies above threshold")

    # Boosted ranking: base similarity + metadata keyword overlap + quality
    # indicators. Boost magnitudes (0.1 / 0.05 / 0.03 / rigor*0.01) are
    # hand-tuned heuristics.
    def enhanced_score(study):
        base_score = study['max_relevance']
        query_lower = query.lower()

        # +0.1 per metadata field that shares any word with the query.
        metadata_boost = 0
        boost_fields = [
            'world_bank_sector', 'world_bank_subsector', 'study_countries',
            'population', 'data_collection_method', 'analysis_type',
            'research_design', 'topic_summary', 'countries_list'
        ]

        for field in boost_fields:
            if field in study and study[field]:
                field_value = str(study[field]).lower()
                if any(word in field_value for word in query_lower.split()):
                    metadata_boost += 0.1

        # Quality boosts: flags are stored as the *string* 'true' in the
        # metadata, hence the string comparison.
        quality_boost = 0
        if study.get('has_randomization') == 'true':
            quality_boost += 0.05
        if study.get('has_validation') == 'true':
            quality_boost += 0.03
        if study.get('has_advanced_analysis') == 'true':
            quality_boost += 0.03

        # rigor_score may be missing or non-numeric; ignore it then.
        try:
            rigor_numeric = float(study.get('rigor_score', 0))
            quality_boost += rigor_numeric * 0.01
        except (ValueError, TypeError):
            pass

        final_score = base_score + metadata_boost + quality_boost
        return final_score

    study_summaries.sort(key=enhanced_score, reverse=True)
    selected_studies = study_summaries[:max_studies]

    print(f"🎯 Selected top {len(selected_studies)} studies for synthesis")
    for i, study in enumerate(selected_studies, 1):
        # NOTE(review): enhanced_score is recomputed here purely for display.
        print(f"   {i}. Score: {enhanced_score(study):.4f} - {study.get('title', 'No title')[:60]}...")

    return selected_studies
244
+
245
def create_synthesis(self, query: str, studies: List[Dict]) -> str:
    """Build a rich per-study context block and ask Gemini for a synthesis.

    Args:
        query: The user's research question.
        studies: Study summaries from filter_and_rank_studies; each must have
            'combined_text', and may carry title/author/metadata fields.

    Returns:
        The model's synthesized answer text, or an error string on failure.
    """
    query_analysis = self.analyze_query(query)

    # Build rich context from studies: header line, key metadata,
    # methodology, quality indicators, then truncated content.
    studies_context = ""
    for i, study in enumerate(studies, 1):
        title = study.get('title', 'Unknown Title')
        authors = study.get('authors', 'Unknown Authors')
        year = study.get('publication_year', study.get('research_year', 'Unknown'))
        countries = study.get('study_countries', study.get('countries_list', 'Unknown'))

        studies_context += f"\n[Study {i}] {title}\n"
        studies_context += f"Authors: {authors} ({year})\n"
        studies_context += f"Countries: {countries}"

        # Key metadata, skipping empty/unknown placeholder values.
        for field, label in [
            ('world_bank_sector', 'Sector'),
            ('world_bank_subsector', 'Subsector'),
            ('population', 'Population'),
            ('sample_size', 'Sample Size'),
            ('research_design', 'Design')
        ]:
            if field in study and study[field] and str(study[field]).lower() not in ['unknown', 'nan', '']:
                studies_context += f" | {label}: {study[field]}"

        studies_context += "\n"

        # Methodology information, same placeholder filtering.
        method_info = []
        for field, label in [
            ('data_collection_method', 'Data Collection'),
            ('analysis_type', 'Analysis'),
            ('primary_data_techniques', 'Primary Methods'),
            ('data_analysis_methods', 'Analysis Methods'),
            ('research_period', 'Period')
        ]:
            if field in study and study[field] and str(study[field]).lower() not in ['unknown', 'nan', '']:
                method_info.append(f"{label}: {study[field]}")

        if method_info:
            studies_context += f"Methodology: {' | '.join(method_info)}\n"

        # Quality indicators: boolean has_* flags (stored as string 'true')
        # contribute just their label; other fields get "label: value".
        quality_info = []
        for field, label in [
            ('rigor_score', 'Rigor Score'),
            ('methodological_sophistication', 'Sophistication'),
            ('has_validation', 'Validation'),
            ('has_randomization', 'RCT'),
            ('has_mixed_methods', 'Mixed Methods')
        ]:
            if field in study and study[field] and str(study[field]).lower() not in ['unknown', 'nan', 'false', '']:
                if field.startswith('has_') and str(study[field]).lower() == 'true':
                    quality_info.append(label)
                elif not field.startswith('has_'):
                    quality_info.append(f"{label}: {study[field]}")

        if quality_info:
            studies_context += f"Quality: {' | '.join(quality_info)}\n"

        # Content truncated to 1000 chars to keep the prompt bounded.
        studies_context += f"Content: {study['combined_text'][:1000]}...\n"
        studies_context += "-" * 80 + "\n"

    synthesis_prompt = f"""You are an expert research synthesizer analyzing studies from fragile and conflict-affected settings (FCAS).

USER QUERY: "{query}"

FOCUS AREA: {query_analysis['focus_area']}

STUDIES TO SYNTHESIZE:
{studies_context}

SYNTHESIS INSTRUCTIONS:
1. **Direct Answer**: Start with a clear, direct answer to the user's question
2. **Evidence-Based**: Ground all claims in the provided studies with specific citations
3. **Methodology Focus**: When relevant, detail research methods, sample sizes, and study designs
4. **Geographic Context**: Highlight geographic patterns and country-specific findings
5. **Quality Assessment**: Note study quality indicators (RCTs, sample sizes, rigor scores)
6. **Practical Insights**: Extract actionable findings and recommendations
7. **Knowledge Gaps**: Identify areas where more research is needed

FORMAT:
- Use clear section headers
- Include specific study citations: (Author, Year)
- Highlight key statistics and findings
- Note methodological strengths and limitations
- Provide geographic context where relevant

Write a comprehensive synthesis that directly addresses the user's question while showcasing the depth and breadth of evidence from these {len(studies)} studies."""

    # Network call to Gemini; any failure is reported in-band as a string
    # rather than raised, so callers always get text back.
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(synthesis_prompt)
        return response.text
    except Exception as e:
        return f"Error creating synthesis: {e}"
344
+
345
def analyze_query(self, query: str) -> Dict[str, str]:
    """Classify the query into a coarse focus area via keyword matching.

    The first matching keyword group (in priority order) wins; queries
    matching nothing fall back to "general findings".
    """
    lowered = query.lower()

    # Priority-ordered keyword groups and their focus-area labels.
    keyword_groups = (
        (('method', 'approach', 'methodology', 'technique'), "methodological approaches"),
        (('result', 'finding', 'outcome', 'impact', 'effect'), "key findings and outcomes"),
        (('challenge', 'barrier', 'problem', 'issue'), "challenges and barriers"),
        (('recommendation', 'solution', 'strategy', 'intervention'), "strategies and recommendations"),
    )

    focus_area = "general findings"
    for keywords, label in keyword_groups:
        if any(word in lowered for word in keywords):
            focus_area = label
            break

    return {
        'focus_area': focus_area,
        'original_query': query
    }
363
+
364
def format_references(self, studies: List[Dict]) -> str:
    """Render numbered, academic-style reference entries.

    Each entry has the form
    ``[n] Authors (Year). Title *Countries: ...* *Relevance: 0.XXX*``
    with the country and relevance tails included only when present.
    Entries are joined with blank lines.
    """
    entries = []

    for num, study in enumerate(studies, 1):
        authors = study.get('authors', 'Unknown Authors')
        year = study.get('publication_year', study.get('research_year', 'Unknown'))
        title = study.get('title', 'Unknown Title')

        pieces = [f"[{num}] {authors} ({year}). {title}"]

        countries = study.get('study_countries', '')
        if countries:
            pieces.append(f" *Countries: {countries}*")

        if study.get('max_relevance'):
            pieces.append(f" *Relevance: {study['max_relevance']:.3f}*")

        entries.append("".join(pieces))

    return "\n\n".join(entries)
386
+
387
def answer_research_question(self, query: str,
                             min_studies: int = 3,
                             max_studies: int = 8) -> Dict[str, str]:
    """End-to-end pipeline: retrieve chunks, rank studies, synthesize an answer.

    Args:
        query: The user's research question.
        min_studies: If fewer studies survive the first filtering pass, the
            relevance threshold is relaxed (0.5 -> 0.3) and filtering retried.
        max_studies: Cap on studies passed to the synthesizer.

    Returns:
        Dict with 'answer' (str), 'references' (str) and 'study_count'.
        NOTE(review): 'study_count' is an int, so the Dict[str, str]
        annotation is loose.
    """
    print(f"πŸ” Processing query: '{query}'")

    # Retrieve a generous pool of chunks; studies are deduplicated below.
    relevant_chunks = self.search_relevant_chunks(query, top_k=25)

    if not relevant_chunks:
        return {
            'answer': "No relevant studies found. This might be due to dimension mismatch or API issues.",
            'references': "",
            'study_count': 0
        }

    # Group chunks by their source study.
    studies_dict = self.group_by_studies(relevant_chunks)
    print(f"πŸ“š Found {len(studies_dict)} unique studies")

    # First pass with a moderately lenient threshold.
    top_studies = self.filter_and_rank_studies(
        studies_dict,
        query=query,
        max_studies=max_studies,
        min_relevance=0.5
    )

    if len(top_studies) < min_studies:
        # Not enough studies survived: retry with an even lower threshold.
        top_studies = self.filter_and_rank_studies(
            studies_dict,
            query=query,
            max_studies=max_studies,
            min_relevance=0.3
        )

    if len(top_studies) == 0:
        return {
            'answer': f"Found {len(studies_dict)} studies but none met relevance criteria. Try broader search terms.",
            'references': "",
            'study_count': len(studies_dict)
        }

    print(f"πŸ“š Synthesizing findings from {len(top_studies)} studies")

    # LLM synthesis plus formatted reference list.
    synthesis = self.create_synthesis(query, top_studies)
    references = self.format_references(top_studies)

    return {
        'answer': synthesis,
        'references': references,
        'study_count': len(top_studies)
    }
442
+
443
+
444
def main():
    """Smoke-test the synthesizer against a few canned queries.

    Reads GOOGLE_API_KEY from the environment (falls back to a placeholder,
    which will make API calls fail) and expects the FAISS index and metadata
    CSV to exist in the working directory.
    """
    import os
    api_key = os.environ.get("GOOGLE_API_KEY", "your_api_key_here")

    synthesizer = ResearchSynthesizer(
        index_path="research_chunks.faiss",
        metadata_path="chunk_metadata.csv",
        api_key=api_key
    )

    test_queries = [
        "agricultural research methods",
        "cash transfer programs",
        "education in fragile states",
        "health interventions"
    ]

    # Run each query end-to-end and print answer + references.
    for query in test_queries:
        print("\n" + "="*80)
        print(f"QUERY: {query}")
        print("="*80)

        result = synthesizer.answer_research_question(query)
        print(f"Studies found: {result['study_count']}")
        print(result['answer'])
        print(result['references'])
        print(f"\nπŸ“Š Synthesized from {result['study_count']} studies")

if __name__ == "__main__":
    main()
475
+
visualisations.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import plotly.express as px
3
+ import plotly.graph_objects as go
4
+ from plotly.subplots import make_subplots
5
+ from collections import Counter
6
+ import numpy as np
7
+
8
def create_world_map(docs_df):
    """Create an interactive choropleth of study coverage for a fixed list of
    conflict-affected countries.

    Args:
        docs_df: Study-level DataFrame with a 'study_countries' column of
            comma- or semicolon-separated country names.

    Returns:
        The plotly Figure (also displayed via fig.show()), or None when the
        DataFrame is empty or lacks 'study_countries'.
    """
    if docs_df.empty or 'study_countries' not in docs_df.columns:
        print("No data or missing 'study_countries' column")
        return None

    # Hard-coded country list with target study counts.
    # NOTE(review): these counts are not derived from docs_df; presumably
    # they come from an external scoping exercise - confirm their source.
    target_countries = {
        # Nationwide conflict
        'Burkina Faso': 1098,
        'Afghanistan': 697,
        'Mali': 496,
        'Sudan': 470,
        'Haiti': 394,
        'Somalia': 373,
        'Niger': 352,
        'Syria': 323,
        'South Sudan': 294,
        'Libya': 119,
        'Palestinian Territories': 81,
        'Central African Republic': 72,
        # Partial conflict
        'Iraq': 128,
        'Nigeria': 121,
        'Lebanon': 102,
        'Ethiopia': 81,
        'Democratic Republic of the Congo': 71,
        'Cameroon': 54,
        'Chad': 36,
        'Mozambique': 30,
        'Myanmar': 11
    }

    # Count how many dataset rows actually mention each target country.
    country_counts = Counter()

    for countries_str in docs_df['study_countries'].dropna():
        if pd.isna(countries_str) or str(countries_str).lower() in ['nan', 'none', '']:
            continue

        # Normalize ';' separators to ',' before splitting.
        countries = [c.strip() for c in str(countries_str).replace(';', ',').split(',')]
        for country in countries:
            if country in target_countries:
                country_counts[country] += 1

    # One row per target country, carrying both target and actual counts.
    # NOTE(review): the >400 cutoff for "Nationwide" disagrees with the
    # comment grouping in target_countries above (e.g. Haiti at 394 is
    # labelled Partial here) - confirm intended classification.
    map_data = []
    for country, target_count in target_countries.items():
        actual_count = country_counts.get(country, 0)
        conflict_type = "Nationwide" if target_count > 400 else "Partial"
        map_data.append({
            'country': country,
            'actual_studies': actual_count,
            'target_studies': target_count,
            'conflict_type': conflict_type
        })

    map_df = pd.DataFrame(map_data)

    print(f"Mapping {len(map_df)} conflict-affected countries")
    print(f"Countries with data: {map_df[map_df['actual_studies'] > 0]['country'].tolist()}")

    # Choropleth colored by the *target* counts; the hover shows both
    # target and in-dataset counts.
    fig = go.Figure(data=go.Choropleth(
        locations=map_df['country'],
        z=map_df['target_studies'],
        locationmode='country names',
        colorscale='Reds',
        hovertemplate='<b>%{location}</b><br>' +
                      'Studies (Target): %{z}<br>' +
                      'Studies (In Dataset): %{customdata}<br>' +
                      '<extra></extra>',
        customdata=map_df['actual_studies'],
        colorbar_title="Number of Studies"
    ))

    fig.update_layout(
        title={
            'text': 'Research Coverage: Conflict-Affected Countries',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='natural earth'
        ),
        height=600,
        width=1000
    )

    fig.show()
    return fig
102
+
103
def create_interactive_data_explorer(docs_df):
    """Summarize dataset completeness and plot a stacked valid/missing bar chart.

    Args:
        docs_df: Study-level DataFrame.

    Returns:
        Tuple (fig, summary_df), or None for an empty DataFrame.
        NOTE(review): the None-vs-tuple return shape is inconsistent;
        callers must guard before unpacking.
    """
    if docs_df.empty:
        print("No data available")
        return None

    print("=== DATASET OVERVIEW ===")
    print(f"Total studies: {len(docs_df)}")
    print(f"Columns available: {len(docs_df.columns)}")

    # Candidate columns for the summary; filtered to those present below.
    numeric_cols = ['publication_year', 'sample_numeric', 'rigor_score', 'sdg_number']
    categorical_cols = [
        'world_bank_sector', 'research_design', 'data_collection_method',
        'analysis_type', 'study_countries', 'population', 'author_income_group',
        'has_validation', 'has_randomization', 'has_mixed_methods', 'has_advanced_analysis'
    ]

    # Filter to existing columns
    available_numeric = [col for col in numeric_cols if col in docs_df.columns]
    available_categorical = [col for col in categorical_cols if col in docs_df.columns]

    print(f"Numeric variables: {available_numeric}")
    print(f"Categorical variables: {available_categorical}")

    # One summary row per variable: type, valid/missing counts, short stats.
    summary_data = []

    # Numeric summaries (coerce non-numeric to NaN first).
    for col in available_numeric:
        values = pd.to_numeric(docs_df[col], errors='coerce').dropna()
        if len(values) > 0:
            summary_data.append({
                'Variable': col,
                'Type': 'Numeric',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"Mean: {values.mean():.1f}, Range: {values.min()}-{values.max()}"
            })

    # Categorical summaries: cardinality and most frequent category.
    for col in available_categorical:
        values = docs_df[col].dropna()
        if len(values) > 0:
            unique_count = values.nunique()
            top_category = values.value_counts().index[0] if len(values) > 0 else "None"
            summary_data.append({
                'Variable': col,
                'Type': 'Categorical',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"{unique_count} categories, Top: {top_category}"
            })

    summary_df = pd.DataFrame(summary_data)

    # Stacked bar chart: valid vs missing counts per variable.
    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Valid_Values'],
        name='Valid Values',
        marker_color='steelblue',
        hovertemplate='<b>%{x}</b><br>Valid: %{y}<br>%{customdata}<extra></extra>',
        customdata=summary_df['Summary']
    ))

    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Missing'],
        name='Missing Values',
        marker_color='lightcoral'
    ))

    fig.update_layout(
        title='Data Completeness by Variable',
        xaxis_title='Variables',
        yaxis_title='Number of Records',
        barmode='stack',
        height=500,
        xaxis={'tickangle': 45}
    )

    fig.show()

    # Plain-text echo of the summary table.
    print("\n=== VARIABLE SUMMARY ===")
    for _, row in summary_df.iterrows():
        print(f"{row['Variable']} ({row['Type']}): {row['Valid_Values']}/{row['Valid_Values'] + row['Missing']} values - {row['Summary']}")

    return fig, summary_df
196
+
197
def create_pivot_analysis(docs_df, row_var, col_var, value_var=None, agg_func='count'):
    """Create a pivot table of two variables and display it as a heatmap.

    Args:
        docs_df: Study-level DataFrame.
        row_var: Column used for pivot rows.
        col_var: Column used for pivot columns.
        value_var: Optional numeric column to aggregate; when omitted a
            simple cross-tabulated count is used.
        agg_func: Aggregation applied to value_var (pandas aggfunc name).

    Returns:
        Tuple (fig, pivot_df), or None on empty data, missing variables,
        or pivot failure.
    """
    if docs_df.empty:
        return None

    if row_var not in docs_df.columns or col_var not in docs_df.columns:
        print(f"Variables not found. Available: {list(docs_df.columns)}")
        return None

    try:
        if value_var and value_var in docs_df.columns:
            # Numeric aggregation of value_var over the row/col grid.
            pivot_df = docs_df.pivot_table(
                index=row_var,
                columns=col_var,
                values=value_var,
                aggfunc=agg_func,
                fill_value=0
            )
            title = f"{agg_func.title()} of {value_var} by {row_var} and {col_var}"
        else:
            # Plain frequency cross-tabulation.
            pivot_df = pd.crosstab(docs_df[row_var], docs_df[col_var])
            title = f"Study Count by {row_var} and {col_var}"

        # Heatmap sized proportionally to the pivot's dimensions.
        fig = px.imshow(
            pivot_df.values,
            x=pivot_df.columns,
            y=pivot_df.index,
            color_continuous_scale='Viridis',
            title=title
        )

        fig.update_layout(
            height=max(400, len(pivot_df.index) * 30),
            width=max(600, len(pivot_df.columns) * 50)
        )

        fig.show()

        print(f"\nPivot Table: {row_var} Γ— {col_var}")
        print(pivot_df.head(10))

        return fig, pivot_df

    except Exception as e:
        # Pivoting can fail e.g. on non-aggregatable dtypes; report and bail.
        print(f"Error creating pivot: {e}")
        return None
246
+
247
+ # Example usage functions
248
def explore_methodology_patterns(docs_df):
    """Cross-tabulate World Bank sector against research design.

    Returns the (fig, pivot_df) pair from create_pivot_analysis, or None
    when the DataFrame is empty or a required column is absent.
    """
    if docs_df.empty:
        return None

    required = ('research_design', 'world_bank_sector')
    if all(column in docs_df.columns for column in required):
        print("=== RESEARCH DESIGN BY SECTOR ===")
        return create_pivot_analysis(docs_df, 'world_bank_sector', 'research_design')
257
+
258
def explore_data_collection(docs_df):
    """Cross-tabulate author income group against data-collection method.

    Returns the (fig, pivot_df) pair from create_pivot_analysis, or None
    when the DataFrame is empty or a required column is absent.
    """
    if docs_df.empty:
        return None

    required = ('data_collection_method', 'author_income_group')
    if all(column in docs_df.columns for column in required):
        print("=== DATA COLLECTION BY AUTHOR INCOME GROUP ===")
        return create_pivot_analysis(docs_df, 'author_income_group', 'data_collection_method')
267
+
268
def filter_and_analyze(docs_df, **filters):
    """Filter the dataset by keyword criteria and print a quick profile.

    Args:
        docs_df: Study-level DataFrame.
        **filters: Supported keys: countries (str or list, substring match),
            sectors (str or list, exact match), min_year, max_year,
            has_rct (truthy -> keep randomized studies only),
            min_sample_size (compared against 'sample_numeric').

    Returns:
        The filtered DataFrame, or None when input is empty or nothing matches.
    """
    if docs_df.empty:
        print("No data available")
        return None

    filtered = docs_df.copy()
    filter_summary = []

    # Apply filters one by one, narrowing `filtered` and recording a
    # human-readable description of each applied criterion.
    if 'countries' in filters and filters['countries']:
        countries = filters['countries'] if isinstance(filters['countries'], list) else [filters['countries']]
        # Substring match against the comma-separated country field.
        country_mask = filtered['study_countries'].str.contains('|'.join(countries), case=False, na=False)
        filtered = filtered[country_mask]
        filter_summary.append(f"Countries: {', '.join(countries)}")

    if 'sectors' in filters and filters['sectors']:
        sectors = filters['sectors'] if isinstance(filters['sectors'], list) else [filters['sectors']]
        sector_mask = filtered['world_bank_sector'].isin(sectors)
        filtered = filtered[sector_mask]
        filter_summary.append(f"Sectors: {', '.join(sectors)}")

    if 'min_year' in filters and filters['min_year']:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col >= filters['min_year']]
        filter_summary.append(f"Year >= {filters['min_year']}")

    if 'max_year' in filters and filters['max_year']:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col <= filters['max_year']]
        filter_summary.append(f"Year <= {filters['max_year']}")

    if 'has_rct' in filters and filters['has_rct']:
        # Randomization flags are stored as strings; accept common truthy forms.
        filtered = filtered[filtered['has_randomization'].str.lower().isin(['true', 'yes', '1'])]
        filter_summary.append("RCT studies only")

    if 'min_sample_size' in filters and filters['min_sample_size']:
        sample_col = pd.to_numeric(filtered['sample_numeric'], errors='coerce')
        filtered = filtered[sample_col >= filters['min_sample_size']]
        filter_summary.append(f"Sample size >= {filters['min_sample_size']}")

    # Report the applied filters and hit rate.
    print(f"=== FILTERED ANALYSIS ===")
    print(f"Filters applied: {'; '.join(filter_summary) if filter_summary else 'None'}")
    print(f"Studies found: {len(filtered)}/{len(docs_df)}")

    if filtered.empty:
        print("No studies match the criteria.")
        return None

    # Quick profile of the subset; only shown for more than 5 matches.
    if len(filtered) > 5:
        # Show key distributions
        if 'world_bank_sector' in filtered.columns:
            print(f"\nTop sectors: {dict(filtered['world_bank_sector'].value_counts().head(3))}")
        if 'research_design' in filtered.columns:
            print(f"Research designs: {dict(filtered['research_design'].value_counts().head(3))}")
        if 'rigor_score' in filtered.columns:
            rigor_scores = pd.to_numeric(filtered['rigor_score'], errors='coerce').dropna()
            if len(rigor_scores) > 0:
                print(f"Rigor score: mean={rigor_scores.mean():.1f}, range={rigor_scores.min()}-{rigor_scores.max()}")

    return filtered
331
+
332
+ # Quick start function
333
def quick_analysis(docs_df):
    """Run a quick end-to-end analysis of the dataset.

    Args:
        docs_df: Study-level DataFrame.

    Returns:
        Tuple (explorer_fig, map_fig, summary_df); elements are None when
        the corresponding step had no data to work with.
    """
    print("Starting comprehensive data analysis...")

    # 1. Data overview. create_interactive_data_explorer returns None (not a
    # tuple) for an empty DataFrame, so guard before unpacking - the previous
    # unconditional unpack raised TypeError on empty input.
    explorer_result = create_interactive_data_explorer(docs_df)
    if explorer_result is None:
        explorer_fig, summary_df = None, None
    else:
        explorer_fig, summary_df = explorer_result

    # 2. Geographic coverage map (None when data/column is missing).
    map_fig = create_world_map(docs_df)

    # 3. Sample pivot analyses (side effects only; figures shown inline).
    if len(docs_df) > 0:
        explore_methodology_patterns(docs_df)
        explore_data_collection(docs_df)

    return explorer_fig, map_fig, summary_df
visualisations_old.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import plotly.express as px
3
+ import plotly.graph_objects as go
4
+ from plotly.subplots import make_subplots
5
+ from collections import Counter
6
+
7
def create_world_map(docs_df):
    """Create an interactive choropleth of study counts per country.

    Unlike the newer variant, this maps every country found in
    'study_countries' (no hard-coded target list).

    Args:
        docs_df: Study-level DataFrame with a 'study_countries' column of
            comma-separated country names.

    Returns:
        The plotly Figure, or None when the DataFrame is empty or no
        countries could be extracted.
    """
    if docs_df.empty:
        return None

    # Count studies by country (a row can contribute to several countries).
    country_counts = Counter()
    for countries_str in docs_df['study_countries'].dropna():
        if isinstance(countries_str, str) and countries_str.lower() != 'nan':
            # Split multiple countries
            countries = [c.strip() for c in countries_str.split(',')]
            for country in countries:
                country_counts[country] += 1

    if not country_counts:
        return None

    # Choropleth keyed by country name.
    countries = list(country_counts.keys())
    counts = list(country_counts.values())

    fig = go.Figure(data=go.Choropleth(
        locations=countries,
        z=counts,
        locationmode='country names',
        colorscale='Viridis',
        text=countries,
        hovertemplate='<b>%{text}</b><br>Studies: %{z}<extra></extra>',
        colorbar_title="Number of Studies"
    ))

    fig.update_layout(
        title={
            'text': '🌍 Global Research Coverage',
            'x': 0.5,
            'font': {'size': 20}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='equirectangular'
        ),
        height=500
    )

    return fig
53
+
54
def create_sector_analysis(docs_df):
    """Build two figures: top-10 sector bar chart and research-design pie.

    Args:
        docs_df: Study-level DataFrame with 'world_bank_sector' and
            'research_design' columns.

    Returns:
        Tuple (bar_fig, pie_fig), or (None, None) for an empty DataFrame.
    """
    if docs_df.empty:
        return None, None

    # Horizontal bar chart of the ten most common sectors.
    sector_counts = docs_df['world_bank_sector'].value_counts().head(10)

    fig1 = px.bar(
        x=sector_counts.values,
        y=sector_counts.index,
        orientation='h',
        title="πŸ“Š Research by World Bank Sector",
        labels={'x': 'Number of Studies', 'y': 'Sector'},
        color=sector_counts.values,
        color_continuous_scale='viridis'
    )
    fig1.update_layout(height=400, showlegend=False)

    # Pie chart of the eight most common research designs.
    design_counts = docs_df['research_design'].value_counts().head(8)

    fig2 = px.pie(
        values=design_counts.values,
        names=design_counts.index,
        title="πŸ”¬ Research Design Distribution",
        color_discrete_sequence=px.colors.qualitative.Set3
    )
    fig2.update_traces(textposition='inside', textinfo='percent+label')
    fig2.update_layout(height=400)

    return fig1, fig2
86
+
87
def create_methodology_dashboard(docs_df):
    """Build a 2x2 dashboard: sample sizes, rigor scores, data-collection
    methods, and quality-indicator counts.

    Args:
        docs_df: Study-level DataFrame. Reads 'sample_size', 'rigor_score',
            'data_collection_method' and the has_* flag columns.
            NOTE(review): this older variant uses 'sample_size', while the
            newer explorer uses 'sample_numeric' - confirm which column the
            current data carries.

    Returns:
        The combined plotly Figure, or None for an empty DataFrame.
    """
    if docs_df.empty:
        return None

    # 2x2 grid; each quadrant is populated only if its column has data.
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Sample Size Distribution', 'Rigor Scores',
                        'Data Collection Methods', 'Quality Indicators'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    # Top-left: histogram of numeric sample sizes.
    sample_sizes = pd.to_numeric(docs_df['sample_size'], errors='coerce').dropna()
    if not sample_sizes.empty:
        fig.add_trace(
            go.Histogram(x=sample_sizes, name="Sample Size", nbinsx=20),
            row=1, col=1
        )

    # Top-right: histogram of rigor scores.
    rigor_scores = pd.to_numeric(docs_df['rigor_score'], errors='coerce').dropna()
    if not rigor_scores.empty:
        fig.add_trace(
            go.Histogram(x=rigor_scores, name="Rigor Score", nbinsx=10),
            row=1, col=2
        )

    # Bottom-left: top-8 data collection methods as a horizontal bar chart.
    data_methods = docs_df['data_collection_method'].value_counts().head(8)
    if not data_methods.empty:
        fig.add_trace(
            go.Bar(x=data_methods.values, y=data_methods.index,
                   orientation='h', name="Data Methods"),
            row=2, col=1
        )

    # Bottom-right: counts of string-'true' quality flags (RCT, validation,
    # mixed methods).
    quality_data = []
    for col in ['has_randomization', 'has_validation', 'has_mixed_methods']:
        if col in docs_df.columns:
            true_count = (docs_df[col] == 'true').sum()
            quality_data.append((col.replace('has_', '').title(), true_count))

    if quality_data:
        labels, values = zip(*quality_data)
        fig.add_trace(
            go.Bar(x=list(labels), y=list(values), name="Quality Features"),
            row=2, col=2
        )

    fig.update_layout(
        height=800,
        title_text="πŸ“ˆ Methodology Dashboard",
        title_x=0.5,
        showlegend=False
    )

    return fig
148
+
149
def filter_studies(docs_df, countries, sectors, min_year, max_year, has_rct, min_sample_size):
    """Filter studies by the given criteria and return a markdown summary.

    Args:
        docs_df: Study-level DataFrame.
        countries: Country substrings to match (list; falsy to skip).
        sectors: Exact sector names to keep (list; falsy to skip).
        min_year / max_year: Inclusive publication-year bounds (falsy to skip).
        has_rct: Truthy to keep only rows with has_randomization == 'true'.
        min_sample_size: Minimum numeric 'sample_size' (falsy to skip).

    Returns:
        A markdown-formatted results string (up to 10 rows shown), or a
        plain message when there is no data / no match.
        NOTE(review): to_markdown() requires the optional 'tabulate'
        dependency at runtime.
    """
    if docs_df.empty:
        return "No data available"

    filtered = docs_df.copy()

    # Apply each filter only when its argument is provided/truthy.
    if countries:
        country_mask = filtered['study_countries'].str.contains('|'.join(countries), case=False, na=False)
        filtered = filtered[country_mask]

    if sectors:
        sector_mask = filtered['world_bank_sector'].isin(sectors)
        filtered = filtered[sector_mask]

    if min_year:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col >= min_year]

    if max_year:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col <= max_year]

    if has_rct:
        # Flags are stored as the string 'true' (cf. the dashboard above).
        filtered = filtered[filtered['has_randomization'] == 'true']

    if min_sample_size:
        sample_col = pd.to_numeric(filtered['sample_size'], errors='coerce')
        filtered = filtered[sample_col >= min_sample_size]

    # Render the result set.
    if filtered.empty:
        return "No studies match the selected criteria."

    # Header with the match count.
    result = f"## πŸ” Filtered Results: {len(filtered)} studies\n\n"

    # Show the first 10 rows over whichever display columns exist.
    display_cols = ['title', 'authors', 'publication_year', 'study_countries',
                    'world_bank_sector', 'research_design', 'sample_size']
    available_cols = [col for col in display_cols if col in filtered.columns]

    sample_df = filtered[available_cols].head(10)
    result += sample_df.to_markdown(index=False)

    if len(filtered) > 10:
        result += f"\n\n*... and {len(filtered) - 10} more studies*"

    return result
199
+
200
def get_unique_values(docs_df):
    """Extract sorted unique country and sector values for UI dropdowns.

    Args:
        docs_df: Study-level DataFrame; 'study_countries' may hold
            comma-separated country lists, 'world_bank_sector' single values.

    Returns:
        Tuple (countries_list, sectors_list) of sorted unique strings.
        Both lists are empty when the DataFrame is empty, and each list is
        empty when its column is absent (the previous version raised
        KeyError on a non-empty frame missing either column).
    """
    countries_list = []
    sectors_list = []

    if not docs_df.empty:
        # Countries: split comma-separated entries, skipping NaN/'nan' noise.
        if 'study_countries' in docs_df.columns:
            for countries_str in docs_df['study_countries'].dropna():
                if isinstance(countries_str, str) and countries_str.lower() != 'nan':
                    countries_list.extend(c.strip() for c in countries_str.split(','))
            countries_list = sorted(set(countries_list))

        # Sectors are single-valued; unique + sort is enough.
        if 'world_bank_sector' in docs_df.columns:
            sectors_list = sorted(docs_df['world_bank_sector'].dropna().unique().tolist())

    return countries_list, sectors_list