lsempe commited on
Commit
9c062cd
Β·
0 Parent(s):

Clean repo, remove binary history

Browse files
.gitattributes ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chunk_metadata.csv filter=lfs diff=lfs merge=lfs -text
37
+ research_chunks.faiss filter=lfs diff=lfs merge=lfs -text
38
+ *.csv filter=lfs diff=lfs merge=lfs -text
39
+ *.faiss filter=lfs diff=lfs merge=lfs -text
40
+ *.pdf filter=lfs diff=lfs merge=lfs -text
41
+ 3ie[[:space:]]colours[[:space:]]proof_edited[[:space:]]03-02-2022.pdf filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Research Q&A System
3
+ emoji: πŸ”¬
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.42.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # πŸ”¬ Advanced Research Q&A System
14
+
15
+ An intelligent research synthesis system that provides AI-powered answers across thousands of academic documents with rich metadata analysis.
16
+
17
+ ## ✨ Features
18
+
19
+ ### πŸ€– AI-Powered Research Synthesis
20
+ - **Semantic Search**: Uses Google's Gemini embeddings to find semantically relevant studies
21
+ - **Multi-Study Synthesis**: Combines findings across multiple research papers
22
+ - **Academic Citations**: Properly formatted references with rich context
23
+ - **Query Intelligence**: Understands methodology, findings, challenges, and recommendation queries
24
+
25
+ ### πŸ“Š Rich Metadata Analysis
26
+ - **35+ Metadata Fields**: Comprehensive research characteristics
27
+ - **Quality Metrics**: Rigor scores, validation status, methodological sophistication
28
+ - **Geographic Coverage**: Global research with interactive maps
29
+ - **Sector Analysis**: World Bank sectors and subsectors
30
+
31
+ ### 🌍 Interactive Visualizations
32
+ - **Global Research Map**: See study distribution worldwide
33
+ - **Methodology Dashboard**: Analyze research designs, sample sizes, data collection methods
34
+ - **Sector Distribution**: Understand research focus areas
35
+ - **Advanced Filtering**: Filter by country, sector, year, sample size, RCT status
36
+
37
+ ## πŸš€ How to Use
38
+
39
+ ### 1. **Ask Questions**
40
+ Enter research questions like:
41
+ - "What methods were used in agricultural research in Yemen?"
42
+ - "How do cash transfer programs impact poverty reduction?"
43
+ - "What are the main challenges in education programs in fragile states?"
44
+
45
+ ### 2. **Explore Data**
46
+ - View global research distribution on interactive maps
47
+ - Analyze methodology patterns and quality metrics
48
+ - Filter studies by multiple criteria
49
+
50
+ ### 3. **Get Synthesized Answers**
51
+ Receive comprehensive answers that:
52
+ - Synthesize findings across multiple studies
53
+ - Include proper academic citations
54
+ - Highlight methodological approaches
55
+ - Note sample sizes and study quality
56
+
57
+ ## πŸ“ˆ Data Coverage
58
+
59
+ - **Geographic**: Studies from fragile and conflict-affected states worldwide
60
+ - **Sectors**: Agriculture, education, health, governance, economics, and more
61
+ - **Methods**: RCTs, observational studies, mixed methods, qualitative research
62
+ - **Quality**: Rigor-scored studies with validation information
63
+
64
+ ## πŸ› οΈ Technology Stack
65
+
66
+ - **AI**: Google Gemini for embeddings and text generation
67
+ - **Search**: FAISS vector database for semantic search
68
+ - **UI**: Gradio for interactive web interface
69
+ - **Visualization**: Plotly and Folium for rich charts and maps
70
+ - **Data**: Pandas for metadata analysis
71
+
72
+ ## πŸ“Š Example Output
73
+
74
+ **Query**: "What methods were used in agricultural research in Yemen?"
75
+
76
+ **Answer**: "Across the studies in agricultural development in Yemen, we find three primary methodological approaches with varying rigor scores. Two randomized controlled trials with sample sizes of 1,200 and 800 households employed structured survey instruments and experimental protocols [1, 3]. Community-based participatory research was extensively used in irrigation studies, with rigor scores above 7.5 and validation through multiple data sources [2, 4]..."
77
+
78
+ **References**:
79
+ - [1] Smith, J., Ahmed, M. (2023). Participatory Water Management in Rural Communities. Countries: Yemen | Sector: Agriculture (Irrigation) | Design: RCT (n=1,200) | Quality: RCT, Validated, Rigor: 8.2
80
+
81
+ ## πŸ”§ Setup for Development
82
+
83
+ 1. Clone the repository
84
+ 2. Install requirements: `pip install -r requirements.txt`
85
+ 3. Add your Google API key as environment variable: `GOOGLE_API_KEY`
86
+ 4. Place your FAISS index and metadata files in the root directory
87
+ 5. Run: `python app.py`
88
+
89
+ ## πŸ“ License
90
+
91
+ Apache 2.0 License
92
+
93
+ ## 🀝 Contributing
94
+
95
+ Contributions welcome! This system can be extended to other research domains and document collections.
96
+
97
+ ---
__pycache__/synthesis_qa_backend.cpython-312.pyc ADDED
Binary file (15.2 kB). View file
 
__pycache__/visualisations.cpython-312.pyc ADDED
Binary file (8.8 kB). View file
 
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =============================================================================
# app.py (main entry point)
# =============================================================================
import logging
import os

# The file handler below needs its target directory to exist up front.
os.makedirs("logs", exist_ok=True)

# Root logging config: everything from DEBUG upward goes to both an
# append-mode file and the stream handler (the stream is what shows up in
# the Spaces Logs tab). Drop the level to INFO if DEBUG is too noisy.
_log_handlers = [
    logging.FileHandler("logs/app.log", mode="a"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=_log_handlers,
)

logger = logging.getLogger(__name__)

# Startup breadcrumbs so a fresh deployment proves logging works at all.
logger.info("App started")
logger.debug("This is debug info")
25
+
26
+
27
+
28
+ import gradio as gr
29
+ from data_handler import DataHandler
30
+ from gradio_callbacks import GradioCallbacks
31
+ from gradio_components import (
32
+ create_header, create_qa_tab, create_overview_tab,
33
+ create_methodology_tab, create_pivot_tab, create_filters_tab, create_about_tab
34
+ )
35
+
36
def main():
    """Construct and return the Gradio Blocks app (launching is done by the caller)."""
    # One shared data layer for every tab: the handler owns the corpus,
    # the callbacks object adapts it to Gradio events.
    handler = DataHandler()
    callbacks = GradioCallbacks(handler)
    shared = handler.get_data()

    page_css = """
        .gradio-container {
            max-width: 1200px !important;
        }
        .main-header {
            text-align: center;
            color: white;
            padding: 2rem;
            border-radius: 10px;
            margin-bottom: 2rem;
        }
        """

    with gr.Blocks(
        theme=gr.themes.Monochrome(),
        title="FCAS Research Methods Evidence Mapping",
        css=page_css,
    ) as app:
        # Page banner, then one builder function per tab.
        create_header()

        with gr.Tabs():
            create_qa_tab(callbacks)
            create_overview_tab(callbacks)
            create_methodology_tab(callbacks)
            create_pivot_tab(callbacks)
            create_filters_tab(callbacks, shared['countries_list'], shared['sectors_list'])
            create_about_tab()

    return app
72
+
73
if __name__ == "__main__":
    # Build the UI and serve it; 0.0.0.0:7860 is the standard binding for
    # Hugging Face Spaces containers.
    app = main()
    app.launch(
        share=True,             # also create a public gradio.live link
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True         # surface tracebacks in the browser UI
    )
app_debug.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Temporary app.py for debugging Hugging Face Space issues
4
+ Replace your current app.py with this file temporarily
5
+ """
6
+
7
+ import os
8
+ import pandas as pd
9
+ import faiss
10
+ import google.generativeai as genai
11
+ import numpy as np
12
+ import gradio as gr
13
+
14
def run_all_checks():
    """Run all diagnostic checks and return results.

    Walks the pipeline in dependency order -- API key, Gemini config, data
    files, FAISS index, metadata CSV, embedding API, search, Python
    environment -- and stops at the first failing stage. Every line is both
    collected and printed, and the full report is returned as one
    newline-joined string for display in the Gradio textbox.
    """
    results = []

    def add_result(text):
        # Collect for the UI and echo to stdout (Spaces log) at the same time.
        results.append(text)
        print(text)

    add_result("🚀 DEBUGGING HUGGING FACE SPACE")
    add_result("=" * 50)

    # Environment Check
    add_result("\n🔍 ENVIRONMENT CHECK")
    add_result("=" * 30)

    # The key has been configured under several names historically; accept any.
    api_keys = {
        "GOOGLE_API_KEY": os.environ.get("GOOGLE_API_KEY"),
        "gemini_api": os.environ.get("gemini_api"),
        "GEMINI_API_KEY": os.environ.get("GEMINI_API_KEY"),
    }

    found_key = None
    for key_name, key_value in api_keys.items():
        if key_value:
            add_result(f"✅ {key_name}: {key_value[:10]}...")
            found_key = key_value
        else:
            add_result(f"❌ {key_name}: Not found")

    if not found_key:
        add_result("❌ No API key found in any expected environment variable")
        return "\n".join(results)

    # Test Gemini API
    try:
        genai.configure(api_key=found_key)
        add_result("✅ Gemini API configured successfully")
    except Exception as e:
        add_result(f"❌ Gemini API configuration failed: {e}")
        return "\n".join(results)

    # File Check
    add_result("\n📁 FILE CHECK")
    add_result("=" * 30)

    add_result(f"Current directory: {os.getcwd()}")
    add_result(f"Directory contents: {os.listdir('.')}")

    files_to_check = [
        "research_chunks.faiss",
        "chunk_metadata.csv",
        "requirements.txt"
    ]

    all_files_exist = True
    for file_path in files_to_check:
        if os.path.exists(file_path):
            size = os.path.getsize(file_path)
            add_result(f"✅ {file_path}: {size:,} bytes")
        else:
            add_result(f"❌ {file_path}: NOT FOUND")
            # requirements.txt is nice-to-have; only the data files are fatal.
            if file_path in ["research_chunks.faiss", "chunk_metadata.csv"]:
                all_files_exist = False

    if not all_files_exist:
        add_result("\n❌ CRITICAL: Missing required data files!")
        add_result("You need to upload:")
        add_result("- research_chunks.faiss (FAISS vector index)")
        add_result("- chunk_metadata.csv (document metadata)")
        return "\n".join(results)

    # FAISS Index Check
    add_result("\n🔍 FAISS INDEX CHECK")
    add_result("=" * 30)

    try:
        index = faiss.read_index("research_chunks.faiss")
        add_result(f"✅ FAISS index loaded: {index.ntotal:,} vectors")
        add_result(f"✅ Index dimension: {index.d}")
        add_result(f"✅ Index type: {type(index).__name__}")
    except Exception as e:
        add_result(f"❌ FAISS index loading failed: {e}")
        return "\n".join(results)

    # Metadata Check
    add_result("\n📊 METADATA CHECK")
    add_result("=" * 30)

    try:
        metadata = pd.read_csv("chunk_metadata.csv")
        add_result(f"✅ Metadata loaded: {len(metadata):,} rows")
        add_result(f"✅ Columns ({len(metadata.columns)}): {list(metadata.columns)[:5]}...")
        add_result(f"✅ Unique records: {metadata['record_id'].nunique():,}")

        # Check for required columns
        required_cols = ['record_id', 'text', 'title']
        missing_cols = [col for col in required_cols if col not in metadata.columns]
        if missing_cols:
            add_result(f"⚠️ Missing required columns: {missing_cols}")
        else:
            add_result("✅ All required columns present")

        # Show sample data
        add_result("\n📝 Sample data:")
        for i, row in metadata.head(2).iterrows():
            add_result(f"Row {i}: {row.get('title', 'No title')}")
            add_result(f"  Text preview: {str(row.get('text', 'No text'))[:100]}...")

    except Exception as e:
        add_result(f"❌ Metadata loading failed: {e}")
        return "\n".join(results)

    # Embedding API Test
    add_result("\n🧠 EMBEDDING API TEST")
    add_result("=" * 30)

    try:
        test_query = "agricultural research methods"
        add_result(f"Testing with query: '{test_query}'")

        embed_result = genai.embed_content(
            model="models/embedding-001",
            content=test_query,
            task_type="retrieval_query"
        )

        embedding = np.array([embed_result['embedding']], dtype="float32")
        add_result(f"✅ Embedding created: shape {embedding.shape}")
        add_result(f"✅ First 5 values: {embedding[0][:5]}")

    except Exception as e:
        add_result(f"❌ Embedding API test failed: {e}")
        return "\n".join(results)

    # Full Search Test
    add_result("\n🔍 FULL SEARCH TEST")
    add_result("=" * 30)

    try:
        distances, indices = index.search(embedding, k=5)
        add_result("✅ Search completed")
        add_result(f"✅ Indices: {indices[0]}")
        add_result(f"✅ Distances: {distances[0]}")

        # FAISS pads missing hits with -1; also guard against stale indices
        # that point past the metadata table.
        valid_indices = [idx for idx in indices[0] if idx != -1 and idx < len(metadata)]
        add_result(f"✅ Valid results: {len(valid_indices)}/5")

        if valid_indices:
            sample_idx = valid_indices[0]
            sample_row = metadata.iloc[sample_idx]
            # Map the distance into a (0, 1] score for readability.
            similarity = 1 / (1 + distances[0][0])
            add_result(f"\n📋 Best match (similarity: {similarity:.3f}):")
            add_result(f"  Title: {sample_row.get('title', 'N/A')}")
            add_result(f"  Text: {str(sample_row.get('text', 'N/A'))[:200]}...")

    except Exception as e:
        add_result(f"❌ Full search test failed: {e}")
        return "\n".join(results)

    # Environment Info
    add_result("\n🐍 PYTHON ENVIRONMENT")
    add_result("=" * 30)

    import sys
    add_result(f"Python version: {sys.version}")
    add_result(f"Platform: {sys.platform}")

    try:
        # importlib.metadata replaces the deprecated pkg_resources API.
        from importlib import metadata as importlib_metadata
        installed = {
            dist.metadata["Name"]
            for dist in importlib_metadata.distributions()
            if dist.metadata["Name"]
        }
        required = ['gradio', 'faiss-cpu', 'google-generativeai', 'pandas', 'numpy', 'plotly']
        missing = [pkg for pkg in required if pkg not in installed]
        if missing:
            add_result(f"⚠️ Missing packages: {missing}")
        else:
            add_result("✅ All required packages installed")
    except Exception:
        add_result("⚠️ Could not check installed packages")

    add_result("\n🎉 ALL TESTS COMPLETED!")
    add_result("\nIf you see this message, your system should be working!")
    add_result("You can now replace this debug app.py with your original app.py")

    return "\n".join(results)
200
+
201
def create_debug_interface():
    """Create a simple Gradio interface for debugging.

    Wraps run_all_checks() in a one-button Blocks app: the diagnostics run
    automatically when the page loads and can be re-run via the button.
    Returns the (unlaunched) gr.Blocks app.
    """

    with gr.Blocks(title="Debug Hugging Face Space") as app:
        # Banner explaining what this temporary app is for.
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(90deg, #ff6b6b, #4ecdc4); color: white; border-radius: 10px;">
            <h1>🔧 Hugging Face Space Debugger</h1>
            <p>This will help identify why your search isn't working</p>
        </div>
        """)

        with gr.Row():
            run_btn = gr.Button("🚀 Run Full Diagnostic", variant="primary", size="lg")

        with gr.Row():
            # Read-only textbox with a copy button so the report can be
            # pasted into an issue or chat easily.
            output = gr.Textbox(
                label="Diagnostic Results",
                lines=30,
                max_lines=50,
                interactive=False,
                show_copy_button=True
            )

        # Auto-run diagnostics on load, and again on every button press.
        app.load(run_all_checks, outputs=output)
        run_btn.click(run_all_checks, outputs=output)

        # Static footer describing what is checked and the next step.
        gr.HTML("""
        <div style="margin-top: 20px; padding: 15px; background: #f0f8ff; border-radius: 5px;">
            <h3>📋 What This Checks:</h3>
            <ul>
                <li><strong>API Key:</strong> Verifies Google Gemini API key is set correctly</li>
                <li><strong>Files:</strong> Checks if FAISS index and metadata CSV exist</li>
                <li><strong>Data:</strong> Validates file contents and structure</li>
                <li><strong>Search:</strong> Tests the complete search pipeline</li>
                <li><strong>Environment:</strong> Verifies Python packages and setup</li>
            </ul>
            <p><strong>Next Steps:</strong> Once all tests pass, replace this debug app.py with your original app.py</p>
        </div>
        """)

    return app
243
+
244
if __name__ == "__main__":
    # Run diagnostics in console first -- failures then show up in the
    # container log even if the web UI never comes up.
    print("Running initial diagnostics...")
    run_all_checks()

    # Launch Gradio interface on the standard Hugging Face Spaces port.
    app = create_debug_interface()
    app.launch(
        share=True,             # also create a public gradio.live link
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True         # surface tracebacks in the browser UI
    )
app_old.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # app.py (main entry point)
3
+ # =============================================================================
4
+ import gradio as gr
5
+ from data_handler import DataHandler
6
+ from gradio_callbacks import GradioCallbacks
7
+ from gradio_components import (
8
+ create_header, create_qa_tab, create_overview_tab,
9
+ create_methodology_tab, create_filters_tab, create_about_tab
10
+ )
11
+
12
def main():
    """Assemble and return the Gradio Blocks app (older layout, no pivot tab)."""
    # Shared data layer: the handler owns the corpus, the callbacks object
    # adapts it to Gradio events.
    handler = DataHandler()
    callbacks = GradioCallbacks(handler)
    shared = handler.get_data()

    with gr.Blocks(
        theme=gr.themes.Monochrome(),
        title="FCAS Research Methods Evidence Mapping",
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .main-header {
            text-align: center;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 2rem;
            border-radius: 10px;
            margin-bottom: 2rem;
        }
        """
    ) as app:
        # Page banner, then one builder function per tab.
        create_header()

        with gr.Tabs():
            create_qa_tab(callbacks)
            create_overview_tab(callbacks)
            create_methodology_tab(callbacks)
            create_filters_tab(callbacks, shared['countries_list'], shared['sectors_list'])
            create_about_tab()

    return app
49
+ if __name__ == "__main__":
50
+ app = main()
51
+ app.launch(
52
+ share=True,
53
+ server_name="0.0.0.0",
54
+ server_port=7860,
55
+ show_error=True
56
+ )
chunk_metadata.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e78c3fdc52942e2bd98529d920258ccb378a5b2ec2ef82afb6617dbc48d15ae
3
+ size 202374040
config.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# API Configuration.
# The API key has been set under different environment-variable names across
# this repo (app_debug.py probes the same three); accept any of them, and
# keep the original placeholder as the final fallback so behaviour is
# unchanged when no key is configured.
API_KEY = (
    os.environ.get("GOOGLE_API_KEY")
    or os.environ.get("GEMINI_API_KEY")
    or os.environ.get("gemini_api")
    or "your_api_key_here"
)
INDEX_PATH = "research_chunks.faiss"   # FAISS vector index on disk
METADATA_PATH = "chunk_metadata.csv"   # per-chunk metadata table

# Countries to include in analysis (fragile and conflict-affected states)
SPECIFIC_COUNTRIES = [
    "Burkina Faso", "Afghanistan", "Mali", "Sudan", "Haiti", "Somalia",
    "Niger", "Syria", "South Sudan", "Libya", "Palestinian Territories",
    "Central African Republic", "Iraq", "Nigeria", "Lebanon", "Ethiopia",
    "Democratic Republic of the Congo", "Cameroon", "Chad", "Mozambique", "Myanmar"
]

# UI Configuration
APP_TITLE = "AI-powered chatbot"
#APP_DESCRIPTION = "AI synthesis across thousands of research documents"

# Default values for the Q&A controls
DEFAULT_MAX_STUDIES = 6
DEFAULT_MIN_RELEVANCE = 0.7
DEFAULT_MIN_YEAR = 2015
DEFAULT_MAX_YEAR = 2025
data_handler.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from synthesis_qa_backend import ResearchSynthesizer
4
+ from config import API_KEY, INDEX_PATH, METADATA_PATH, SPECIFIC_COUNTRIES
5
+
6
class DataHandler:
    """Loads the FAISS-backed synthesizer and the chunk-metadata table, and
    exposes the deduplicated document frame plus the dropdown choice lists.
    """

    def __init__(self):
        self.synthesizer = None        # ResearchSynthesizer, or None after a load failure
        self.docs_df = pd.DataFrame()  # one row per unique record_id
        self.countries_list = []       # dropdown choices, restricted to SPECIFIC_COUNTRIES
        self.sectors_list = []         # dropdown choices from world_bank_sector
        self.load_data()

    def load_data(self):
        """Initialize the research system and load data.

        On any failure the handler degrades to an empty state instead of
        raising, so the UI can still start and show an empty app.
        """
        try:
            self.synthesizer = ResearchSynthesizer(INDEX_PATH, METADATA_PATH, API_KEY)
            metadata_df = pd.read_csv(METADATA_PATH)
            # The CSV holds one row per *chunk*; keep one row per document.
            self.docs_df = metadata_df.drop_duplicates(subset=['record_id'])
            print(f"✅ Loaded {len(self.docs_df)} unique documents")

            # Get unique values for dropdowns
            self.countries_list, self.sectors_list = self._get_unique_values()

        except Exception as e:
            print(f"❌ Error loading system: {e}")
            self.synthesizer = None
            self.docs_df = pd.DataFrame()

    def _get_unique_values(self):
        """Return (countries, sectors) choice lists for the filter dropdowns."""
        if self.docs_df.empty:
            return [], []

        countries_list = []
        sectors_list = []

        if 'study_countries' in self.docs_df.columns:
            for countries_str in self.docs_df['study_countries'].dropna():
                # dropna() already removed real NaN; this guards against the
                # *strings* "nan"/"none" and empty cells left by upstream export.
                if str(countries_str).lower() in ['nan', 'none', '']:
                    continue
                # Cells may be comma- or semicolon-delimited lists.
                countries = [c.strip() for c in str(countries_str).replace(';', ',').split(',')]
                countries_list.extend(
                    c for c in countries if c in SPECIFIC_COUNTRIES and len(c) > 1
                )

            countries_list = sorted(set(countries_list))

        if 'world_bank_sector' in self.docs_df.columns:
            sectors_list = sorted(self.docs_df['world_bank_sector'].dropna().unique().tolist())

        return countries_list, sectors_list

    def get_data(self):
        """Return all shared objects as a dict consumed by GradioCallbacks."""
        return {
            'synthesizer': self.synthesizer,
            'docs_df': self.docs_df,
            'countries_list': self.countries_list,
            'sectors_list': self.sectors_list
        }
gradio_callbacks.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # gradio_callbacks.py
3
+ # =============================================================================
4
+ import pandas as pd
5
+ from visualisations import (
6
+ create_world_map, create_interactive_data_explorer,
7
+ filter_and_analyze, create_pivot_analysis
8
+ )
9
+
10
+ class GradioCallbacks:
11
+ def __init__(self, data_handler):
12
+ self.data = data_handler.get_data()
13
+
14
+ def get_column_choices(self):
15
+ """Get available columns for pivot analysis"""
16
+ if self.data['docs_df'].empty:
17
+ return [], [], []
18
+
19
+ df = self.data['docs_df']
20
+
21
+ # Define potential categorical columns
22
+ categorical_cols = []
23
+ potential_categorical = [
24
+ 'world_bank_sector', 'research_design', 'data_collection_method',
25
+ 'analysis_type', 'study_countries', 'population', 'author_income_group',
26
+ 'has_validation', 'has_randomization', 'has_mixed_methods',
27
+ 'has_advanced_analysis', 'world_bank_subsector', 'topic_summary'
28
+ ]
29
+
30
+ for col in potential_categorical:
31
+ if col in df.columns and df[col].notna().sum() > 0:
32
+ # Check if column has reasonable number of unique values (2-50)
33
+ unique_count = df[col].nunique()
34
+ if 2 <= unique_count <= 50:
35
+ categorical_cols.append(col)
36
+
37
+ # Define potential numeric columns
38
+ numeric_cols = []
39
+ potential_numeric = [
40
+ 'publication_year', 'sample_numeric', 'rigor_score', 'sdg_number',
41
+ 'research_year', 'word_count'
42
+ ]
43
+
44
+ for col in potential_numeric:
45
+ if col in df.columns:
46
+ # Try to convert to numeric and check if we have valid values
47
+ numeric_values = pd.to_numeric(df[col], errors='coerce')
48
+ if numeric_values.notna().sum() > 0:
49
+ numeric_cols.append(col)
50
+
51
+ all_cols = list(df.columns)
52
+
53
+ return categorical_cols, numeric_cols, all_cols
54
+
55
+ def create_pivot_analysis(self, row_var, col_var, value_var, agg_func):
56
+ """Create pivot analysis with visualization"""
57
+ try:
58
+ from visualisations import create_pivot_analysis
59
+
60
+ if not row_var or not col_var:
61
+ return None, "Please select both row and column variables."
62
+
63
+ if self.data['docs_df'].empty:
64
+ return None, "No data available for analysis."
65
+
66
+ # Check if variables exist
67
+ df = self.data['docs_df']
68
+ if row_var not in df.columns or col_var not in df.columns:
69
+ return None, f"Selected variables not found in dataset."
70
+
71
+ # Handle value variable
72
+ if value_var == "None" or not value_var:
73
+ value_var = None
74
+ elif value_var not in df.columns:
75
+ return None, f"Value variable '{value_var}' not found in dataset."
76
+
77
+ # Create the pivot analysis
78
+ result = create_pivot_analysis(df, row_var, col_var, value_var, agg_func)
79
+
80
+ if result is None:
81
+ return None, "Could not create pivot analysis. Check your variable selections."
82
+
83
+ # Handle different return types
84
+ if isinstance(result, tuple):
85
+ fig, pivot_df = result
86
+ else:
87
+ fig = result
88
+ pivot_df = None
89
+
90
+ # Create summary text
91
+ if pivot_df is not None:
92
+ summary = f"**Pivot Analysis: {row_var} Γ— {col_var}**\n\n"
93
+ summary += f"- Rows: {len(pivot_df.index)} categories\n"
94
+ summary += f"- Columns: {len(pivot_df.columns)} categories\n"
95
+ summary += f"- Aggregation: {agg_func}\n"
96
+
97
+ if value_var:
98
+ summary += f"- Value variable: {value_var}\n"
99
+
100
+ # Add top findings
101
+ if hasattr(pivot_df, 'values'):
102
+ total_sum = pivot_df.values.sum()
103
+ summary += f"- Total: {total_sum:.0f}\n"
104
+
105
+ # Find max cell
106
+ max_idx = pivot_df.values.argmax()
107
+ max_row_idx, max_col_idx = divmod(max_idx, pivot_df.shape[1])
108
+ max_row = pivot_df.index[max_row_idx]
109
+ max_col = pivot_df.columns[max_col_idx]
110
+ max_val = pivot_df.values[max_row_idx, max_col_idx]
111
+ summary += f"- Highest value: {max_val:.1f} ({max_row} Γ— {max_col})\n"
112
+ else:
113
+ summary = f"Pivot analysis completed for {row_var} Γ— {col_var}"
114
+
115
+ return fig, summary
116
+
117
+ except Exception as e:
118
+ error_msg = f"Error creating pivot analysis: {str(e)}"
119
+ print(error_msg) # For debugging
120
+ return None, error_msg
121
+
122
+ def create_overview_plots(self):
123
+ """Create overview plots for the Global Overview tab"""
124
+ try:
125
+ import plotly.express as px
126
+
127
+ world_map = create_world_map(self.data['docs_df'])
128
+
129
+ # Simple sector analysis
130
+ if 'world_bank_sector' in self.data['docs_df'].columns:
131
+ sector_counts = self.data['docs_df']['world_bank_sector'].value_counts().head(10)
132
+ sector_plot = px.bar(
133
+ x=sector_counts.values,
134
+ y=sector_counts.index,
135
+ orientation='h',
136
+ title="Studies by World Bank Sector",
137
+ labels={'x': 'Number of Studies', 'y': 'Sector'}
138
+ )
139
+ else:
140
+ sector_plot = None
141
+
142
+ if 'research_design' in self.data['docs_df'].columns:
143
+ design_counts = self.data['docs_df']['research_design'].value_counts().head(8)
144
+ design_plot = px.pie(
145
+ values=design_counts.values,
146
+ names=design_counts.index,
147
+ title="Research Design Distribution"
148
+ )
149
+ else:
150
+ design_plot = None
151
+
152
+ return world_map, sector_plot, design_plot
153
+
154
+ except Exception as e:
155
+ print(f"Error creating overview plots: {e}")
156
+ return None, None, None
157
+
158
def create_methodology_analysis(self):
    """Return the data-completeness figure for the Methodology tab.

    ``create_interactive_data_explorer`` may return either a bare figure
    or a ``(figure, summary)`` tuple; only the figure is forwarded.
    """
    result = create_interactive_data_explorer(self.data['docs_df'])
    return result[0] if isinstance(result, tuple) else result
164
+
165
def filter_studies(self, countries, sectors, min_year, max_year, rct_only, min_sample):
    """Filter studies and return the first 20 matches as Markdown.

    Args:
        countries: list of country names to keep (falsy = no filter).
        sectors: list of World Bank sectors to keep (falsy = no filter).
        min_year / max_year: inclusive publication-year bounds.
        rct_only: when truthy, keep only randomized controlled trials.
        min_sample: minimum sample size (ignored when falsy or <= 0).

    Returns:
        str: Markdown-formatted study summaries, or a human-readable
        message when there is no data / no match / an error.
    """
    if self.data['docs_df'].empty:
        return "No data available for filtering."

    try:
        # Translate UI inputs into the keyword filters understood by
        # filter_and_analyze; falsy inputs mean "no constraint".
        filters = {}
        if countries:
            filters['countries'] = countries
        if sectors:
            filters['sectors'] = sectors
        if min_year:
            filters['min_year'] = int(min_year)
        if max_year:
            filters['max_year'] = int(max_year)
        if rct_only:
            filters['has_rct'] = True
        if min_sample and min_sample > 0:
            filters['min_sample_size'] = int(min_sample)

        filtered_df = filter_and_analyze(self.data['docs_df'], **filters)

        if filtered_df is None or filtered_df.empty:
            return "No studies match your filters."

        # (column, label) pairs rendered as bullet lines under each title.
        # Fields needing special formatting are handled separately below.
        bullet_fields = [
            ('authors', 'Authors'),
            ('publication_year', 'Year'),
            ('study_countries', 'Countries'),
            ('world_bank_sector', 'Sector'),
            ('research_design', 'Design'),
        ]

        results = []
        for _, row in filtered_df.head(20).iterrows():
            result_parts = []

            if 'title' in row:
                result_parts.append(f"### {row['title']}")

            for col, label in bullet_fields:
                if col in row:
                    result_parts.append(f"- **{label}**: {row[col]}")

            # Sample size is shown as an integer and skipped when missing.
            if 'sample_numeric' in row and pd.notna(row['sample_numeric']):
                result_parts.append(f"- **Sample Size**: {int(row['sample_numeric'])}")

            # Source column may hold booleans or strings; normalize to Yes/No.
            if 'has_randomization' in row:
                rct_status = 'Yes' if str(row['has_randomization']).lower() in ['true', 'yes', '1'] else 'No'
                result_parts.append(f"- **RCT**: {rct_status}")

            results.append('\n'.join(result_parts))

        result_text = '\n\n'.join(results)

        if len(filtered_df) > 20:
            result_text += f"\n\n*... and {len(filtered_df) - 20} more studies*"

        return result_text

    except Exception as e:
        return f"Error filtering studies: {e}"
235
+
236
def answer_question(self, question):
    """Answer a research question with a multi-study synthesis.

    Args:
        question: free-text research question from the UI.

    Returns:
        tuple[str, str, str]: (answer markdown, references markdown,
        stats markdown). The last two are empty strings on any
        non-successful outcome.
    """
    # Synthesizer may be None when initialization failed upstream.
    if not self.data['synthesizer']:
        return "⚠️ System not initialized. Please check configuration.", "", ""

    if not question.strip():
        return "Please enter a research question.", "", ""

    try:
        result = self.data['synthesizer'].answer_research_question(query=question)

        # Handle different response types from new system.
        # 'out_of_scope': query is not about the research domain; show the
        # backend's message plus any suggested reformulations.
        if result['quality'] == 'out_of_scope':
            suggestions_text = ""
            if result['suggestions']:
                suggestions_text = "\n\n**πŸ’‘ Try queries like:**\n" + "\n".join([f"β€’ {s}" for s in result['suggestions']])

            return f"⚠️ {result['answer']}{suggestions_text}", "", ""

        if result['quality'] == 'no_results':
            return "No relevant studies found. Try a broader query or different keywords.", "", ""

        if result['quality'] == 'insufficient':
            return f"⚠️ {result['answer']}", "", ""

        # Defensive double-check: some quality values may still carry zero studies.
        if result['study_count'] == 0:
            return "No relevant studies found. Try a broader query.", "", ""

        # Format the successful response; unknown quality values get no badge.
        quality_indicator = {
            'high': '🟒 High Quality',
            'moderate': '🟑 Moderate Quality',
            'low': 'πŸ”΄ Low Quality'
        }.get(result['quality'], '')

        answer = f"## πŸ” Research Synthesis {quality_indicator}\n\n{result['answer']}"

        references = f"## πŸ“š References\n\n{result['references']}"

        # Enhanced stats with quality info (quality_message is optional).
        stats = f"**Studies analyzed:** {result['study_count']}"
        if 'quality_message' in result:
            stats += f"\n**Quality:** {result['quality_message']}"

        return answer, references, stats

    except Exception as e:
        return f"Error processing query: {str(e)}", "", ""
gradio_callbacks_old.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # =============================================================================
3
+ # gradio_callbacks.py
4
+ # =============================================================================
5
+ import pandas as pd
6
+ from visualisations import (
7
+ create_world_map, create_interactive_data_explorer,
8
+ filter_and_analyze
9
+ )
10
+
11
class GradioCallbacks:
    """Legacy callback collection for the Gradio UI (superseded copy).

    Wraps a data handler and exposes the methods wired to UI events:
    question answering, overview plots, methodology analysis, and
    study filtering.
    """

    def __init__(self, data_handler):
        # Snapshot of the handler's data dict (docs_df, synthesizer, ...).
        self.data = data_handler.get_data()

    def answer_question(self, question, max_studies, min_relevance):
        """Answer research question with synthesis.

        Returns (answer, references, stats) markdown strings; the last two
        are empty on failure.
        """
        # NOTE(review): min_relevance is accepted but never forwarded to the
        # backend call below — confirm whether it was meant to be passed.
        if not self.data['synthesizer']:
            return "❌ System not initialized. Please check configuration.", "", ""

        if not question.strip():
            return "Please enter a research question.", "", ""

        try:
            result = self.data['synthesizer'].answer_research_question(
                query=question,
                min_studies=2,
                max_studies=int(max_studies)
            )

            if result['study_count'] == 0:
                return "No relevant studies found. Try a broader query.", "", ""

            answer = f"## πŸ“ Research Synthesis\n\n{result['answer']}"
            references = f"## πŸ“š References\n\n{result['references']}"
            stats = f"**Studies analyzed:** {result['study_count']}"

            return answer, references, stats

        except Exception as e:
            return f"Error processing query: {e}", "", ""

    def create_overview_plots(self):
        """Create overview plots for the Global Overview tab.

        Returns (world_map, sector_plot, design_plot); elements are None
        when the needed column is missing or plotting fails.
        """
        try:
            import plotly.express as px

            world_map = create_world_map(self.data['docs_df'])

            # Simple sector analysis: top-10 horizontal bar chart.
            if 'world_bank_sector' in self.data['docs_df'].columns:
                sector_counts = self.data['docs_df']['world_bank_sector'].value_counts().head(10)
                sector_plot = px.bar(
                    x=sector_counts.values,
                    y=sector_counts.index,
                    orientation='h',
                    title="Studies by World Bank Sector",
                    labels={'x': 'Number of Studies', 'y': 'Sector'}
                )
            else:
                sector_plot = None

            # Top-8 research designs as a pie chart.
            if 'research_design' in self.data['docs_df'].columns:
                design_counts = self.data['docs_df']['research_design'].value_counts().head(8)
                design_plot = px.pie(
                    values=design_counts.values,
                    names=design_counts.index,
                    title="Research Design Distribution"
                )
            else:
                design_plot = None

            return world_map, sector_plot, design_plot

        except Exception as e:
            print(f"Error creating overview plots: {e}")
            return None, None, None

    def create_methodology_analysis(self):
        """Return the data-completeness figure for the Methodology tab."""
        result = create_interactive_data_explorer(self.data['docs_df'])
        if isinstance(result, tuple):
            return result[0]  # Return only the figure
        else:
            return result

    def filter_studies(self, countries, sectors, min_year, max_year, rct_only, min_sample):
        """Filter studies and return formatted results (first 20 matches)."""
        if self.data['docs_df'].empty:
            return "No data available for filtering."

        try:
            # Build keyword filters; falsy UI values mean "no constraint".
            filters = {}

            if countries:
                filters['countries'] = countries
            if sectors:
                filters['sectors'] = sectors
            if min_year:
                filters['min_year'] = int(min_year)
            if max_year:
                filters['max_year'] = int(max_year)
            if rct_only:
                filters['has_rct'] = True
            if min_sample and min_sample > 0:
                filters['min_sample_size'] = int(min_sample)

            filtered_df = filter_and_analyze(self.data['docs_df'], **filters)

            if filtered_df is None or filtered_df.empty:
                return "No studies match your filters."

            # Format results
            results = []
            # NOTE(review): display_cols is unused below — dead code?
            display_cols = ['title', 'authors', 'publication_year', 'study_countries',
                            'world_bank_sector', 'research_design', 'sample_numeric']

            for _, row in filtered_df.head(20).iterrows():
                result_parts = []

                if 'title' in row:
                    result_parts.append(f"### {row['title']}")

                if 'authors' in row:
                    result_parts.append(f"- **Authors**: {row['authors']}")

                if 'publication_year' in row:
                    result_parts.append(f"- **Year**: {row['publication_year']}")

                if 'study_countries' in row:
                    result_parts.append(f"- **Countries**: {row['study_countries']}")

                if 'world_bank_sector' in row:
                    result_parts.append(f"- **Sector**: {row['world_bank_sector']}")

                if 'research_design' in row:
                    result_parts.append(f"- **Design**: {row['research_design']}")

                # Shown as an integer; skipped when the value is missing.
                if 'sample_numeric' in row and pd.notna(row['sample_numeric']):
                    result_parts.append(f"- **Sample Size**: {int(row['sample_numeric'])}")

                # Source column may hold booleans or strings; normalize to Yes/No.
                if 'has_randomization' in row:
                    rct_status = 'Yes' if str(row['has_randomization']).lower() in ['true', 'yes', '1'] else 'No'
                    result_parts.append(f"- **RCT**: {rct_status}")

                results.append('\n'.join(result_parts))

            result_text = '\n\n'.join(results)

            if len(filtered_df) > 20:
                result_text += f"\n\n*... and {len(filtered_df) - 20} more studies*"

            return result_text

        except Exception as e:
            return f"Error filtering studies: {e}"
gradio_components.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # gradio_components.py
3
+ # =============================================================================
4
+ import gradio as gr
5
+ from config import APP_TITLE, DEFAULT_MAX_STUDIES, DEFAULT_MIN_RELEVANCE, DEFAULT_MIN_YEAR, DEFAULT_MAX_YEAR
6
+
7
+
8
def create_overview_tab(callbacks):
    """Create the Global Overview tab.

    Lays out the world map and two summary plots, and wires the refresh
    button to callbacks.create_overview_plots.
    """
    with gr.Tab("🌍 Global Overview", id="overview"):
        gr.Markdown("## Research Landscape Analysis")

        with gr.Row():
            world_map_plot = gr.Plot(label="Global Research Distribution")

        with gr.Row():
            sector_plot = gr.Plot(label="Sector Analysis")
            design_plot = gr.Plot(label="Research Designs")

        # Plots load on demand rather than at app start-up.
        overview_btn = gr.Button("πŸ”„ Load Overview", variant="secondary")
        overview_btn.click(
            callbacks.create_overview_plots,
            outputs=[world_map_plot, sector_plot, design_plot]
        )
25
+
26
def create_methodology_tab(callbacks):
    """Create the Methodology Dashboard tab.

    A single on-demand plot driven by callbacks.create_methodology_analysis.
    """
    with gr.Tab("πŸ“Š Methodology Dashboard", id="methods"):
        gr.Markdown("## Deep Dive into Research Methods & Quality")

        methodology_plot = gr.Plot(label="Data Completeness")

        methodology_btn = gr.Button("πŸ”„ Load Analysis", variant="secondary")
        methodology_btn.click(
            callbacks.create_methodology_analysis,
            outputs=[methodology_plot]
        )
38
+
39
def create_filters_tab(callbacks, countries_list, sectors_list):
    """Create the Advanced Search tab.

    Args:
        callbacks: GradioCallbacks instance providing filter_studies.
        countries_list: dropdown choices for the country filter.
        sectors_list: dropdown choices for the sector filter.
    """
    with gr.Tab("πŸ” Advanced Search", id="filters"):
        gr.Markdown("## Filter and Explore Studies")

        with gr.Row():
            # Column 1: categorical filters.
            with gr.Column():
                country_filter = gr.Dropdown(
                    choices=countries_list,
                    label="Countries",
                    multiselect=True,
                    interactive=True
                )

                sector_filter = gr.Dropdown(
                    choices=sectors_list,
                    label="World Bank Sectors",
                    multiselect=True,
                    interactive=True
                )

            # Column 2: publication-year range.
            with gr.Column():
                min_year_filter = gr.Number(
                    label="Minimum Publication Year",
                    value=DEFAULT_MIN_YEAR,
                    precision=0
                )

                max_year_filter = gr.Number(
                    label="Maximum Publication Year",
                    value=DEFAULT_MAX_YEAR,
                    precision=0
                )

            # Column 3: methodological filters.
            with gr.Column():
                rct_filter = gr.Checkbox(
                    label="Only Randomized Controlled Trials",
                    value=False
                )

                min_sample_filter = gr.Number(
                    label="Minimum Sample Size",
                    value=None,
                    precision=0
                )

        filter_btn = gr.Button("πŸ” Apply Filters", variant="primary")

        filtered_results = gr.Markdown(label="Filtered Studies")

        filter_btn.click(
            callbacks.filter_studies,
            inputs=[country_filter, sector_filter, min_year_filter,
                    max_year_filter, rct_filter, min_sample_filter],
            outputs=filtered_results
        )
95
+
96
def create_about_tab():
    """Create the About tab (static Markdown; no callbacks needed)."""
    with gr.Tab("ℹ️ About", id="about"):
        gr.Markdown("""
        ## About This Research Q&A System

        This system provides intelligent synthesis across thousands of research documents using:

        ### πŸ€– **AI-Powered Analysis**
        - **Semantic Search**: Uses Google's Gemini embeddings to find relevant studies
        - **Smart Synthesis**: Combines findings across multiple studies with proper citations
        - **Rich Metadata**: Leverages 35+ metadata fields per document

        ### πŸ“Š **Research Quality Metrics**
        - **Rigor Scores**: Methodological quality assessment
        - **Study Design**: RCTs, observational studies, mixed methods
        - **Validation Status**: Peer review and replication information

        ### 🌍 **Global Coverage**
        - Studies from fragile and conflict-affected states
        - Multiple World Bank sectors and regions
        - Comprehensive geographic and temporal coverage

        ### πŸ”¬ **Advanced Features**
        - Interactive visualizations and maps
        - Advanced filtering and search capabilities
        - Academic-style citations with full context

        ---

        **Data Sources**: Research documents from development economics, impact evaluation, and policy studies

        **Technology**: Built with Gradio, FAISS, Google Gemini AI, and Plotly
        """)
130
+
131
def create_pivot_tab(callbacks):
    """Create the Pivot Analysis tab.

    Dropdown choices come from callbacks.get_column_choices(), which is
    expected to return (categorical_cols, numeric_cols, all_cols).
    """
    with gr.Tab("πŸ“Š Pivot Analysis", id="pivot"):
        gr.Markdown("## Interactive Pivot Table Analysis")
        gr.Markdown("Create cross-tabulations and pivot tables to explore relationships in the data")

        with gr.Row():
            with gr.Column():
                # Get column choices
                categorical_cols, numeric_cols, all_cols = callbacks.get_column_choices()

                # Default to the first two categoricals when available.
                row_var = gr.Dropdown(
                    choices=categorical_cols,
                    label="Row Variable",
                    value=categorical_cols[0] if categorical_cols else None,
                    interactive=True
                )

                col_var = gr.Dropdown(
                    choices=categorical_cols,
                    label="Column Variable",
                    value=categorical_cols[1] if len(categorical_cols) > 1 else None,
                    interactive=True
                )

                # None means plain cross-tab counts instead of aggregation.
                value_var = gr.Dropdown(
                    choices=[None] + numeric_cols,
                    label="Value Variable (optional - for numeric aggregation)",
                    value=None,
                    interactive=True
                )

                agg_func = gr.Dropdown(
                    choices=["count", "mean", "sum"],
                    label="Aggregation Function",
                    value="count",
                    interactive=True
                )

                pivot_btn = gr.Button("πŸ”„ Create Pivot Analysis", variant="primary")

            with gr.Column():
                gr.Markdown("### πŸ’‘ Suggested Analyses")
                gr.Markdown("""
                **Popular combinations:**
                - Research Design Γ— World Bank Sector (count)
                - Countries Γ— Has Randomization (count)
                - Author Income Group Γ— Data Collection Method (count)
                - Research Design Γ— Rigor Score (mean)
                - World Bank Sector Γ— Sample Size (mean)

                **Tips:**
                - Use 'count' to see frequency distributions
                - Use 'mean' or 'sum' with numeric value variables
                - Choose variables with reasonable number of categories
                """)

        with gr.Row():
            pivot_plot = gr.Plot(label="Pivot Heatmap")

        with gr.Row():
            pivot_summary = gr.Markdown(label="Pivot Table Summary")

        pivot_btn.click(
            callbacks.create_pivot_analysis,
            inputs=[row_var, col_var, value_var, agg_func],
            outputs=[pivot_plot, pivot_summary]
        )
199
+
200
def create_qa_tab(callbacks):
    """Create the Q&A tab.

    The submit button feeds the question text to callbacks.answer_question,
    which returns (answer, references, stats) markdown strings.
    """
    with gr.Tab("πŸ€– Ask Questions", id="qa"):
        gr.Markdown("## Ask questions about research methods, findings, or approaches")

        with gr.Row():
            with gr.Column(scale=2):
                question_input = gr.Textbox(
                    label="Research Question",
                    placeholder="e.g., 'What sampling strategies work best in conflict-affected areas?'",
                    lines=3
                )

                submit_btn = gr.Button("πŸ” Search & Synthesize", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### πŸ’‘ Example Methodological Questions")
                gr.Markdown("""
                - What sampling strategies work best in conflict-affected areas?
                - How do researchers ensure data quality during active conflict?
                - What are the ethical considerations for RCTs in fragile states?
                - How do researchers adapt survey instruments for low-literacy populations?
                - What methods are used to track mobile populations in FCAS?
                - How do studies address attrition bias in longitudinal FCAS research?
                - What proxy measures are used when direct measurement is impossible?
                - How do researchers validate self-reported data in conflict settings?
                - What approaches work for establishing counterfactuals in FCAS?
                - How do studies handle missing data due to displacement or conflict?
                """)

        # Results
        with gr.Row():
            answer_output = gr.Markdown(label="Synthesis")

        with gr.Row():
            references_output = gr.Markdown(label="References")

        stats_output = gr.Markdown(label="Statistics")

        submit_btn.click(
            callbacks.answer_question,
            inputs=[question_input],
            outputs=[answer_output, references_output, stats_output]
        )
gradio_components_old.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # gradio_components.py
3
+ # =============================================================================
4
+ import gradio as gr
5
+ from config import APP_TITLE, APP_DESCRIPTION, DEFAULT_MAX_STUDIES, DEFAULT_MIN_RELEVANCE, DEFAULT_MIN_YEAR, DEFAULT_MAX_YEAR
6
+
7
def create_header():
    """Create the app header (legacy copy).

    Renders APP_TITLE and APP_DESCRIPTION inside a styled HTML banner;
    the 'main-header' class is presumably defined in the app's CSS —
    TODO confirm where it is declared.
    """
    return gr.HTML(f"""
    <div class="main-header">
        <h1>{APP_TITLE}</h1>
        <p>{APP_DESCRIPTION}</p>
    </div>
    """)
15
+
16
def create_qa_tab(callbacks):
    """Create the Q&A tab (legacy copy).

    Unlike the newer version, this variant exposes max-studies and
    min-relevance sliders and passes them to callbacks.answer_question.
    """
    with gr.Tab("πŸ€– Ask Questions", id="qa"):
        gr.Markdown("## Ask questions about research methods, findings, or approaches")

        with gr.Row():
            with gr.Column(scale=2):
                question_input = gr.Textbox(
                    label="Research Question",
                    placeholder="e.g., 'What methods were used in agricultural research in Yemen?'",
                    lines=3
                )

                with gr.Row():
                    max_studies = gr.Slider(
                        label="Max Studies to Analyze",
                        minimum=3,
                        maximum=10,
                        value=DEFAULT_MAX_STUDIES,
                        step=1
                    )

                    min_relevance = gr.Slider(
                        label="Minimum Relevance Score",
                        minimum=0.5,
                        maximum=0.9,
                        value=DEFAULT_MIN_RELEVANCE,
                        step=0.05
                    )

                submit_btn = gr.Button("πŸ” Search & Synthesize", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### πŸ’‘ Example Questions")
                gr.Markdown("""
                - What methods were used in agricultural research in Yemen?
                - How do cash transfer programs impact poverty reduction?
                - What are the main challenges in education programs in fragile states?
                - What evaluation approaches are used for health interventions?
                - Which countries have the most rigorous impact evaluations?
                """)

        # Results
        with gr.Row():
            answer_output = gr.Markdown(label="Synthesis")

        with gr.Row():
            references_output = gr.Markdown(label="References")

        stats_output = gr.Markdown(label="Statistics")

        submit_btn.click(
            callbacks.answer_question,
            inputs=[question_input, max_studies, min_relevance],
            outputs=[answer_output, references_output, stats_output]
        )
72
+
73
def create_overview_tab(callbacks):
    """Create the Global Overview tab (legacy copy, identical layout)."""
    with gr.Tab("🌍 Global Overview", id="overview"):
        gr.Markdown("## Research Landscape Analysis")

        with gr.Row():
            world_map_plot = gr.Plot(label="Global Research Distribution")

        with gr.Row():
            sector_plot = gr.Plot(label="Sector Analysis")
            design_plot = gr.Plot(label="Research Designs")

        # Plots load on demand rather than at app start-up.
        overview_btn = gr.Button("πŸ”„ Load Overview", variant="secondary")
        overview_btn.click(
            callbacks.create_overview_plots,
            outputs=[world_map_plot, sector_plot, design_plot]
        )
90
+
91
def create_methodology_tab(callbacks):
    """Create the Methodology Dashboard tab (legacy copy)."""
    with gr.Tab("πŸ“Š Methodology Dashboard", id="methods"):
        gr.Markdown("## Deep Dive into Research Methods & Quality")

        methodology_plot = gr.Plot(label="Data Completeness")

        methodology_btn = gr.Button("πŸ”„ Load Analysis", variant="secondary")
        methodology_btn.click(
            callbacks.create_methodology_analysis,
            outputs=[methodology_plot]
        )
103
+
104
def create_filters_tab(callbacks, countries_list, sectors_list):
    """Create the Advanced Search tab (legacy copy).

    Args:
        callbacks: GradioCallbacks instance providing filter_studies.
        countries_list: dropdown choices for the country filter.
        sectors_list: dropdown choices for the sector filter.
    """
    with gr.Tab("πŸ” Advanced Search", id="filters"):
        gr.Markdown("## Filter and Explore Studies")

        with gr.Row():
            # Column 1: categorical filters.
            with gr.Column():
                country_filter = gr.Dropdown(
                    choices=countries_list,
                    label="Countries",
                    multiselect=True,
                    interactive=True
                )

                sector_filter = gr.Dropdown(
                    choices=sectors_list,
                    label="World Bank Sectors",
                    multiselect=True,
                    interactive=True
                )

            # Column 2: publication-year range.
            with gr.Column():
                min_year_filter = gr.Number(
                    label="Minimum Publication Year",
                    value=DEFAULT_MIN_YEAR,
                    precision=0
                )

                max_year_filter = gr.Number(
                    label="Maximum Publication Year",
                    value=DEFAULT_MAX_YEAR,
                    precision=0
                )

            # Column 3: methodological filters.
            with gr.Column():
                rct_filter = gr.Checkbox(
                    label="Only Randomized Controlled Trials",
                    value=False
                )

                min_sample_filter = gr.Number(
                    label="Minimum Sample Size",
                    value=None,
                    precision=0
                )

        filter_btn = gr.Button("πŸ” Apply Filters", variant="primary")

        filtered_results = gr.Markdown(label="Filtered Studies")

        filter_btn.click(
            callbacks.filter_studies,
            inputs=[country_filter, sector_filter, min_year_filter,
                    max_year_filter, rct_filter, min_sample_filter],
            outputs=filtered_results
        )
160
+
161
def create_about_tab():
    """Create the About tab (legacy copy; static Markdown only)."""
    with gr.Tab("ℹ️ About", id="about"):
        gr.Markdown("""
        ## About This Research Q&A System

        This system provides intelligent synthesis across thousands of research documents using:

        ### πŸ€– **AI-Powered Analysis**
        - **Semantic Search**: Uses Google's Gemini embeddings to find relevant studies
        - **Smart Synthesis**: Combines findings across multiple studies with proper citations
        - **Rich Metadata**: Leverages 35+ metadata fields per document

        ### πŸ“Š **Research Quality Metrics**
        - **Rigor Scores**: Methodological quality assessment
        - **Study Design**: RCTs, observational studies, mixed methods
        - **Validation Status**: Peer review and replication information

        ### 🌍 **Global Coverage**
        - Studies from fragile and conflict-affected states
        - Multiple World Bank sectors and regions
        - Comprehensive geographic and temporal coverage

        ### πŸ”¬ **Advanced Features**
        - Interactive visualizations and maps
        - Advanced filtering and search capabilities
        - Academic-style citations with full context

        ---

        **Data Sources**: Research documents from development economics, impact evaluation, and policy studies

        **Technology**: Built with Gradio, FAISS, Google Gemini AI, and Plotly
        """)
195
+
196
def create_pivot_tab(callbacks):
    """Create the Pivot Analysis tab (legacy copy).

    Dropdown choices come from callbacks.get_column_choices(), which is
    expected to return (categorical_cols, numeric_cols, all_cols).
    """
    with gr.Tab("πŸ“Š Pivot Analysis", id="pivot"):
        gr.Markdown("## Interactive Pivot Table Analysis")
        gr.Markdown("Create cross-tabulations and pivot tables to explore relationships in the data")

        with gr.Row():
            with gr.Column():
                # Get column choices
                categorical_cols, numeric_cols, all_cols = callbacks.get_column_choices()

                # Default to the first two categoricals when available.
                row_var = gr.Dropdown(
                    choices=categorical_cols,
                    label="Row Variable",
                    value=categorical_cols[0] if categorical_cols else None,
                    interactive=True
                )

                col_var = gr.Dropdown(
                    choices=categorical_cols,
                    label="Column Variable",
                    value=categorical_cols[1] if len(categorical_cols) > 1 else None,
                    interactive=True
                )

                # None means plain cross-tab counts instead of aggregation.
                value_var = gr.Dropdown(
                    choices=[None] + numeric_cols,
                    label="Value Variable (optional - for numeric aggregation)",
                    value=None,
                    interactive=True
                )

                agg_func = gr.Dropdown(
                    choices=["count", "mean", "sum"],
                    label="Aggregation Function",
                    value="count",
                    interactive=True
                )

                pivot_btn = gr.Button("πŸ”„ Create Pivot Analysis", variant="primary")

            with gr.Column():
                gr.Markdown("### πŸ’‘ Suggested Analyses")
                gr.Markdown("""
                **Popular combinations:**
                - Research Design Γ— World Bank Sector (count)
                - Countries Γ— Has Randomization (count)
                - Author Income Group Γ— Data Collection Method (count)
                - Research Design Γ— Rigor Score (mean)
                - World Bank Sector Γ— Sample Size (mean)

                **Tips:**
                - Use 'count' to see frequency distributions
                - Use 'mean' or 'sum' with numeric value variables
                - Choose variables with reasonable number of categories
                """)

        with gr.Row():
            pivot_plot = gr.Plot(label="Pivot Heatmap")

        with gr.Row():
            pivot_summary = gr.Markdown(label="Pivot Table Summary")

        pivot_btn.click(
            callbacks.create_pivot_analysis,
            inputs=[row_var, col_var, value_var, agg_func],
            outputs=[pivot_plot, pivot_summary]
        )
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.44.1
2
+ pandas>=1.5.0
3
+ numpy>=1.21.0
4
+ plotly>=5.15.0
5
+ folium>=0.14.0
6
+ faiss-cpu>=1.7.4
7
+ google-generativeai>=0.3.0
8
+ scikit-learn>=1.3.0
9
+ tabulate>=0.9.0
research_chunks.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82a1b861e56617f09458ac02fc58e5d70927f888b34af515043c1e047cfa644b
3
+ size 65310765
synthesis_qa_backend.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import faiss
3
+ import pandas as pd
4
+ import numpy as np
5
+ import google.generativeai as genai
6
+ from typing import List, Dict, Optional, Tuple
7
+ from collections import defaultdict
8
+ import logging
9
+ import time
10
+ from dataclasses import dataclass
11
+
12
+
13
@dataclass
class SynthesisConfig:
    """Configuration class for research synthesis parameters."""
    top_k: int = 20                           # candidates retrieved per query
    min_relevance_strict: float = 0.7         # high-confidence inclusion cut-off
    min_relevance_moderate: float = 0.6       # moderate-confidence cut-off
    min_relevance_threshold: float = 0.55     # absolute floor for inclusion
    max_studies: int = 6
    min_studies: int = 4
    max_synthesis_tokens: int = 4000
    rate_limit_delay: float = 1.0             # seconds between API calls
    # None is a sentinel replaced in __post_init__; a list default here
    # would be a shared mutable default across instances.
    domain_keywords: Optional[List[str]] = None

    def __post_init__(self):
        # Install the default keyword list only when the caller did not
        # provide one.
        if self.domain_keywords is None:
            self.domain_keywords = [
                'development', 'health', 'education', 'governance', 'poverty',
                'conflict', 'fragile', 'intervention', 'policy', 'evaluation',
                'impact', 'program', 'research', 'study', 'analysis', 'survey'
            ]
33
+
34
+ class QueryAnalyzer:
35
+ """Analyzes queries to determine relevance to research domain"""
36
+
37
def __init__(self, config: SynthesisConfig):
    # Keep the shared synthesis configuration (thresholds, keyword lists).
    self.config = config
39
+
40
def is_domain_relevant(self, query: str) -> Tuple[bool, float, str]:
    """
    Check if query is relevant to research domain.

    Scoring: keyword hits from config.domain_keywords count 1 each,
    research-question phrasings count 2 each, methodological terms count
    1 each; hard-blocked topics short-circuit to False first.

    Returns: (is_relevant, confidence_score, reason)
    """
    query_lower = query.lower()

    # Check for obvious non-research queries — any hit rejects immediately.
    non_research_patterns = [
        'who won', 'world cup', 'sports', 'entertainment', 'celebrity',
        'weather', 'stock price', 'cryptocurrency', 'movie', 'music',
        'recipe', 'cooking', 'fashion', 'shopping', 'games', 'gaming'
    ]

    for pattern in non_research_patterns:
        if pattern in query_lower:
            return False, 0.1, f"Query contains non-research pattern: '{pattern}'"

    # Check for domain relevance - multiple approaches
    domain_matches = sum(1 for keyword in self.config.domain_keywords
                         if keyword in query_lower)

    # Research question patterns (even without domain keywords)
    research_patterns = [
        'what methods', 'what approaches', 'how do', 'how to',
        'what strategies', 'what techniques', 'how can',
        'what are the', 'which methods', 'which approaches'
    ]

    research_pattern_matches = sum(1 for pattern in research_patterns
                                   if pattern in query_lower)

    # Methodological terms that indicate research focus
    method_terms = [
        'method', 'approach', 'strategy', 'technique', 'measure',
        'measurement', 'data', 'sample', 'study', 'research',
        'analysis', 'evaluation', 'assessment', 'design'
    ]

    method_matches = sum(1 for term in method_terms if term in query_lower)

    # Calculate total relevance score (research patterns weighted double).
    total_score = domain_matches + (research_pattern_matches * 2) + method_matches

    if total_score == 0:
        return False, 0.3, "No domain-relevant keywords or research patterns found"

    # Be more generous for methodological queries: confidence starts at
    # 0.6 and grows with the score, capped at 0.9.
    if research_pattern_matches > 0 or method_matches >= 2:
        confidence = min(0.9, 0.6 + (total_score * 0.05))
        return True, confidence, f"Found research patterns and methodological terms (score: {total_score})"

    # Keyword-only match: lower base confidence (0.5), same 0.9 cap.
    if domain_matches > 0:
        confidence = min(0.9, 0.5 + (domain_matches * 0.1))
        return True, confidence, f"Found {domain_matches} domain-relevant keywords"

    return False, 0.3, "Insufficient domain relevance"
97
+
98
+ def analyze_query_type(self, query: str) -> Dict[str, str]:
99
+ """Analyze query to determine focus area and type"""
100
+ query_lower = query.lower()
101
+
102
+ focus_area = "general findings"
103
+ query_type = "exploratory"
104
+
105
+ # Determine focus area
106
+ if any(word in query_lower for word in ['method', 'approach', 'methodology', 'technique', 'design']):
107
+ focus_area = "methodological approaches"
108
+ query_type = "methodological"
109
+ elif any(word in query_lower for word in ['result', 'finding', 'outcome', 'impact', 'effect', 'evaluation']):
110
+ focus_area = "key findings and outcomes"
111
+ query_type = "results-focused"
112
+ elif any(word in query_lower for word in ['challenge', 'barrier', 'problem', 'issue', 'difficulty']):
113
+ focus_area = "challenges and barriers"
114
+ query_type = "problem-identification"
115
+ elif any(word in query_lower for word in ['recommendation', 'solution', 'strategy', 'intervention', 'policy']):
116
+ focus_area = "strategies and recommendations"
117
+ query_type = "solution-oriented"
118
+ elif any(word in query_lower for word in ['what', 'how', 'why', 'which', 'where']):
119
+ query_type = "analytical"
120
+
121
+ # Additional FCAS-specific analysis
122
+ if any(word in query_lower for word in ['sampling', 'sample', 'recruitment', 'selection']):
123
+ focus_area = "sampling and recruitment strategies"
124
+ query_type = "methodological"
125
+ elif any(word in query_lower for word in ['data quality', 'validation', 'reliability', 'validity']):
126
+ focus_area = "data quality and validation"
127
+ query_type = "methodological"
128
+ elif any(word in query_lower for word in ['ethical', 'ethics', 'consent', 'protection']):
129
+ focus_area = "ethical considerations"
130
+ query_type = "methodological"
131
+ elif any(word in query_lower for word in ['tracking', 'mobile', 'displacement', 'attrition']):
132
+ focus_area = "population tracking and attrition"
133
+ query_type = "methodological"
134
+ elif any(word in query_lower for word in ['proxy', 'indicator', 'measurement', 'counterfactual']):
135
+ focus_area = "measurement and identification strategies"
136
+ query_type = "methodological"
137
+
138
+ return {
139
+ 'focus_area': focus_area,
140
+ 'query_type': query_type,
141
+ 'original_query': query
142
+ }
143
+
144
class ResearchSynthesizer:
    """Retrieval-augmented QA over a FAISS index of research-paper chunks.

    Pipeline: embed the user query with the Gemini embeddings API, retrieve
    nearest chunks from the FAISS index, group chunks by study, filter and
    rank studies with adaptive relevance thresholds, then ask a Gemini
    generative model to synthesize an answer with formatted references.
    """

    def __init__(self, index_path: str, metadata_path: str, api_key: str,
                 config: Optional[SynthesisConfig] = None,
                 log_level: int = logging.INFO):
        """Initialize the research synthesis system.

        Args:
            index_path: Path to the FAISS index file.
            metadata_path: Path to the per-chunk metadata CSV. Must contain a
                'record_id' column; other columns are treated as study metadata.
            api_key: Google Generative AI API key.
            config: Optional SynthesisConfig; defaults are used when omitted.
            log_level: Logging level passed to logging.basicConfig.

        Raises:
            ValueError: From _validate_inputs for bad paths/key/thresholds.
            Exception: Re-raised when loading the index/metadata or
                configuring the API fails.
        """

        # Setup logging.
        # NOTE(review): basicConfig mutates process-global logging state;
        # fine for a script, surprising when used as a library.
        logging.basicConfig(level=log_level, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

        # Configuration
        self.config = config or SynthesisConfig()
        self.query_analyzer = QueryAnalyzer(self.config)

        # Validate inputs before any expensive loading.
        self._validate_inputs(index_path, metadata_path, api_key)

        try:
            # Load FAISS index and metadata
            self.index = faiss.read_index(index_path)
            self.metadata = pd.read_csv(metadata_path)

            # Configure Gemini API
            genai.configure(api_key=api_key)

            self.logger.info(f"Loaded {self.index.ntotal} chunks from {len(self.metadata['record_id'].unique())} documents")
            self.logger.info(f"FAISS index dimensions: {self.index.d}")

            # Check embedding-vs-index dimension compatibility; sets
            # self.dimension_mismatch (and target/source dims when mismatched).
            self._check_dimensions()

        except Exception as e:
            self.logger.error(f"Failed to initialize synthesizer: {e}")
            raise

    def _validate_inputs(self, index_path: str, metadata_path: str, api_key: str):
        """Validate constructor parameters; raises ValueError on bad input."""
        if not index_path or not metadata_path:
            raise ValueError("Index path and metadata path must be provided")

        # Rejects the placeholder key used by the demo main().
        if not api_key or api_key == "your_api_key_here":
            raise ValueError("Valid API key must be provided")

        # Threshold ordering sanity check for the adaptive filtering logic.
        if self.config.min_relevance_strict < self.config.min_relevance_moderate:
            raise ValueError("Strict relevance threshold must be >= moderate threshold")

    def _check_dimensions(self):
        """Probe the embeddings API and record any dimension mismatch.

        Sets self.dimension_mismatch, and when mismatched also
        self.target_dim (FAISS) and self.source_dim (Gemini), so that
        _adjust_embedding_dimensions can pad/truncate query embeddings.
        """
        test_embedding = self._create_test_embedding()
        if test_embedding is not None:
            embedding_dim = test_embedding.shape[1]
            index_dim = self.index.d

            self.logger.info(f"Gemini embedding dimensions: {embedding_dim}")

            if embedding_dim != index_dim:
                self.logger.warning(f"DIMENSION MISMATCH: Gemini={embedding_dim}, FAISS={index_dim}")
                self.logger.info("Will apply dimension adjustment during search")
                self.dimension_mismatch = True
                self.target_dim = index_dim
                self.source_dim = embedding_dim
            else:
                self.logger.info("Dimensions match perfectly")
                self.dimension_mismatch = False
        else:
            # Probe failed (e.g. API error): proceed optimistically without
            # adjustment rather than refusing to start.
            self.logger.warning("Could not test embedding dimensions")
            self.dimension_mismatch = False

    def _create_test_embedding(self) -> Optional[np.ndarray]:
        """Embed the string "test" to learn the API's output dimensionality.

        Returns a (1, dim) float32 array, or None if the API call fails.
        """
        try:
            time.sleep(self.config.rate_limit_delay)  # Rate limiting
            embed_result = genai.embed_content(
                model="models/gemini-embedding-001",
                content="test",
                task_type="retrieval_query"
            )
            return np.array([embed_result['embedding']], dtype="float32")
        except Exception as e:
            self.logger.error(f"Could not create test embedding: {e}")
            return None

    def _adjust_embedding_dimensions(self, embedding: np.ndarray) -> np.ndarray:
        """Pad (zeros) or truncate a (1, dim) embedding to the index dimension.

        No-op when no mismatch was detected. Both pad and truncate are lossy
        approximations of a proper re-projection.
        """
        if not self.dimension_mismatch:
            return embedding

        current_dim = embedding.shape[1]
        target_dim = self.target_dim

        self.logger.debug(f"Adjusting dimensions: {current_dim} β†’ {target_dim}")

        if current_dim < target_dim:
            # Pad with zeros
            padding = np.zeros((embedding.shape[0], target_dim - current_dim), dtype="float32")
            adjusted = np.concatenate([embedding, padding], axis=1)
        elif current_dim > target_dim:
            # Truncate (consider PCA for better semantic preservation)
            adjusted = embedding[:, :target_dim]
        else:
            adjusted = embedding

        return adjusted

    def search_relevant_chunks(self, query: str) -> List[Dict]:
        """Embed *query* and retrieve the top_k nearest chunks from FAISS.

        Returns a list of metadata dicts (one per chunk) augmented with
        'similarity_score', 'faiss_distance' and 'faiss_index', sorted by
        similarity descending. Returns [] on embedding or search failure.
        """
        self.logger.info(f"Searching for: '{query}'")

        try:
            time.sleep(self.config.rate_limit_delay)  # Rate limiting
            embed_result = genai.embed_content(
                model="models/gemini-embedding-001",
                content=query,
                task_type="retrieval_query"
            )
            query_embedding = np.array([embed_result['embedding']], dtype="float32")
            self.logger.debug(f"Embedding created: shape {query_embedding.shape}")

        except Exception as e:
            self.logger.error(f"Embedding creation failed: {e}")
            return []

        # Adjust dimensions if needed
        query_embedding = self._adjust_embedding_dimensions(query_embedding)

        try:
            distances, indices = self.index.search(query_embedding, self.config.top_k)
            self.logger.info(f"Search completed - found {len(indices[0])} results")
            self.logger.debug(f"Distance range: {distances[0].min():.4f} to {distances[0].max():.4f}")
        except Exception as e:
            self.logger.error(f"FAISS search failed: {e}")
            return []

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            # -1 is FAISS's "no result" sentinel; also guard against rows
            # missing from the metadata CSV.
            if idx == -1 or idx >= len(self.metadata):
                continue

            try:
                chunk_data = self.metadata.iloc[idx].to_dict()
                # Distance -> similarity via 1/(1+d); assumes smaller distance
                # means closer (e.g. an L2 index) — confirm the index metric.
                chunk_data['similarity_score'] = float(1 / (1 + distance))
                chunk_data['faiss_distance'] = float(distance)
                chunk_data['faiss_index'] = int(idx)
                results.append(chunk_data)
            except (IndexError, KeyError) as e:
                self.logger.warning(f"Invalid index {idx}, skipping: {e}")
                continue

        # Sort by similarity score
        results.sort(key=lambda x: x['similarity_score'], reverse=True)

        if results:
            best_score = results[0]['similarity_score']
            worst_score = results[-1]['similarity_score']
            self.logger.info(f"Similarity range: {worst_score:.4f} to {best_score:.4f}")

        return results

    def group_by_studies(self, chunks: List[Dict]) -> Dict[str, List[Dict]]:
        """Group retrieved chunks by their 'record_id' (one entry per study)."""
        studies = defaultdict(list)
        for chunk in chunks:
            studies[chunk['record_id']].append(chunk)
        return dict(studies)

    def filter_and_rank_studies(self, studies: Dict[str, List[Dict]],
                                query: str = "") -> Tuple[List[Dict], str]:
        """
        Select the most relevant studies using adaptive thresholds.

        Picks the strictest of the three configured thresholds that the best
        study still clears, filters studies by it, builds one summary dict per
        surviving study, and ranks by similarity plus metadata/quality boosts.

        Returns: (selected_studies, quality_message)
        """
        study_summaries = []

        # Determine threshold based on best available scores
        all_best_scores = []
        for record_id, chunks in studies.items():
            best_chunk = max(chunks, key=lambda x: x['similarity_score'])
            all_best_scores.append(best_chunk['similarity_score'])

        if not all_best_scores:
            return [], "No studies found"

        max_score = max(all_best_scores)
        # NOTE(review): avg_score is computed but never used below.
        avg_score = np.mean(all_best_scores)

        # Adaptive threshold selection: strictest tier the best study reaches.
        if max_score >= self.config.min_relevance_strict:
            threshold = self.config.min_relevance_strict
            quality = "high"
        elif max_score >= self.config.min_relevance_moderate:
            threshold = self.config.min_relevance_moderate
            quality = "moderate"
        elif max_score >= self.config.min_relevance_threshold:
            threshold = self.config.min_relevance_threshold
            quality = "low"
        else:
            return [], f"No studies met minimum relevance threshold. Best score: {max_score:.3f}"

        self.logger.info(f"Using {quality} quality threshold: {threshold:.3f}")

        # Filter studies
        for record_id, chunks in studies.items():
            best_chunk = max(chunks, key=lambda x: x['similarity_score'])

            if best_chunk['similarity_score'] < threshold:
                continue

            # Get relevant chunks with slightly lower threshold (80% of tier).
            relevant_chunks = [c for c in chunks
                               if c['similarity_score'] > threshold * 0.8]

            # Limit text to prevent token overflow: at most 3 chunks...
            combined_texts = [c['text'] for c in relevant_chunks[:3]]
            combined_text = "\n\n".join(combined_texts)

            # ...and at most 1500 characters of combined text.
            if len(combined_text) > 1500:
                combined_text = combined_text[:1500] + "..."

            study_summary = {
                'record_id': record_id,
                'combined_text': combined_text,
                'max_relevance': best_chunk['similarity_score'],
                'chunk_count': len(relevant_chunks)
            }

            # Copy metadata from the best chunk (excluding internal fields).
            excluded_fields = {
                'record_id', 'full_text', 'text', 'chunk_id', 'section',
                'chunk_type', 'word_count', 'faiss_distance', 'faiss_index'
            }

            for key, value in best_chunk.items():
                if key not in excluded_fields and not key.startswith('similarity'):
                    study_summary[key] = value

            study_summaries.append(study_summary)

        # Ranking: base similarity plus capped metadata/quality boosts.
        def enhanced_score(study):
            base_score = study['max_relevance']

            # Metadata relevance boost (query terms appearing in study metadata)
            metadata_boost = self._calculate_metadata_boost(study, query)

            # Quality indicators boost (rigor/randomization flags)
            quality_boost = self._calculate_quality_boost(study)

            return base_score + metadata_boost + quality_boost

        study_summaries.sort(key=enhanced_score, reverse=True)
        selected_studies = study_summaries[:self.config.max_studies]

        quality_message = f"Selected {len(selected_studies)} studies with {quality} relevance (threshold: {threshold:.3f})"

        self.logger.info(quality_message)
        for i, study in enumerate(selected_studies, 1):
            title = study.get('title', 'No title')[:50]
            # NOTE(review): recomputes enhanced_score per study for logging.
            score = enhanced_score(study)
            self.logger.debug(f"  {i}. Score: {score:.4f} - {title}...")

        return selected_studies, quality_message

    def _calculate_metadata_boost(self, study: Dict, query: str) -> float:
        """Boost for query words found in a study's metadata fields.

        Each word-in-field match adds 0.05; total is capped at 0.2.
        """
        query_lower = query.lower()
        metadata_boost = 0

        boost_fields = [
            'world_bank_sector', 'world_bank_subsector', 'study_countries',
            'population', 'data_collection_method', 'analysis_type',
            'research_design', 'topic_summary', 'countries_list'
        ]

        for field in boost_fields:
            if field in study and study[field]:
                field_value = str(study[field]).lower()
                matches = sum(1 for word in query_lower.split() if word in field_value)
                metadata_boost += matches * 0.05  # Smaller, more controlled boost

        return min(metadata_boost, 0.2)  # Cap the boost

    def _calculate_quality_boost(self, study: Dict) -> float:
        """Boost derived from study-quality indicators in the metadata.

        Boolean flags add fixed increments; rigor_score adds up to 0.1 more.
        """
        quality_boost = 0

        # Boolean quality indicators.
        # NOTE(review): compares against the string 'true' — presumably the
        # CSV stores lowercase string booleans; confirm against the data.
        bool_indicators = {
            'has_randomization': 0.08,
            'has_validation': 0.05,
            'has_advanced_analysis': 0.03,
            'has_mixed_methods': 0.03
        }

        for field, boost in bool_indicators.items():
            if study.get(field) == 'true':
                quality_boost += boost

        # Numeric quality indicators; non-numeric values are silently ignored.
        try:
            rigor_score = float(study.get('rigor_score', 0))
            quality_boost += min(rigor_score * 0.02, 0.1)  # Cap at 0.1
        except (ValueError, TypeError):
            pass

        return quality_boost

    def create_synthesis(self, query: str, studies: List[Dict],
                         query_analysis: Dict) -> str:
        """Generate the synthesized answer text via the Gemini model.

        Builds a per-study context block, scales the requested answer length
        with the study count, and prompts the model. Returns the model text,
        or an error string if generation fails.
        """

        # Build concise context
        studies_context = self._build_studies_context(studies)

        # Determine synthesis length based on study count
        if len(studies) <= 3:
            synthesis_style = "concise"
            max_length = "2-3 paragraphs"
        elif len(studies) <= 6:
            synthesis_style = "balanced"
            max_length = "3-4 paragraphs with clear sections"
        else:
            synthesis_style = "comprehensive"
            max_length = "4-5 paragraphs with detailed analysis"

        synthesis_prompt = f"""You are an expert research synthesizer analyzing studies from fragile and conflict-affected settings (FCAS).

USER QUERY: "{query}"
QUERY TYPE: {query_analysis['query_type']}
FOCUS AREA: {query_analysis['focus_area']}

STUDIES TO SYNTHESIZE ({len(studies)} studies):
{studies_context}

SYNTHESIS INSTRUCTIONS:
1. **Direct Answer First**: Start with a clear, direct answer to the user's question
2. **Evidence-Based**: Ground all claims in the provided studies with citations (Author, Year)
3. **{synthesis_style.title()} Analysis**: Write {max_length}
4. **Key Focus**: Emphasize {query_analysis['focus_area']}
5. **Geographic Context**: Note relevant country/regional patterns
6. **Methodology**: Briefly mention study designs and sample sizes when relevant

FORMAT: Use clear prose without bullet points. Include specific citations and key statistics.
LENGTH: {max_length} maximum.

Write a focused synthesis that directly addresses: "{query}" """

        try:
            time.sleep(self.config.rate_limit_delay)  # Rate limiting
            model = genai.GenerativeModel("gemini-1.5-flash")
            response = model.generate_content(synthesis_prompt)
            return response.text
        except Exception as e:
            self.logger.error(f"Synthesis generation failed: {e}")
            # Errors are surfaced in-band as the answer text rather than raised.
            return f"Error creating synthesis: {e}"

    def _build_studies_context(self, studies: List[Dict]) -> str:
        """Render the selected studies into the prompt's context section.

        Each entry: truncated title/authors/countries, a compact method line,
        and up to 800 characters of the study's combined chunk text.
        """
        studies_context = ""

        for i, study in enumerate(studies, 1):
            # Essential metadata, truncated to keep the prompt small.
            # NOTE(review): slicing assumes these values are strings; a
            # non-string (e.g. pandas NaN) would raise — confirm CSV dtypes.
            title = study.get('title', 'Unknown Title')[:80]
            authors = study.get('authors', 'Unknown Authors')[:50]
            year = study.get('publication_year', study.get('research_year', 'Unknown'))
            countries = study.get('study_countries', study.get('countries_list', 'Unknown'))[:50]

            studies_context += f"\n[{i}] {title}\n"
            studies_context += f"Authors: {authors} ({year}) | Countries: {countries}\n"

            # Key methodology info
            method_info = []
            for field, label in [
                ('research_design', 'Design'),
                ('sample_size', 'N'),
                ('rigor_score', 'Rigor')
            ]:
                if field in study and study[field]:
                    value = str(study[field])
                    if value.lower() not in ['unknown', 'nan', '']:
                        method_info.append(f"{label}: {value}")

            if method_info:
                studies_context += f"Method: {' | '.join(method_info)}\n"

            # Truncated content
            content = study['combined_text'][:800]
            studies_context += f"Content: {content}...\n"
            studies_context += "-" * 60 + "\n"

        return studies_context

    def format_references(self, studies: List[Dict]) -> str:
        """Format academic-style numbered references for the selected studies."""
        references = []

        for i, study in enumerate(studies, 1):
            title = study.get('title', 'Unknown Title')
            authors = study.get('authors', 'Unknown Authors')
            year = study.get('publication_year', study.get('research_year', 'Unknown'))
            countries = study.get('study_countries', '')

            ref = f"[{i}] {authors} ({year}). {title}"

            if countries:
                ref += f" *Countries: {countries}*"

            if study.get('max_relevance'):
                ref += f" *Relevance: {study['max_relevance']:.3f}*"

            references.append(ref)

        return "\n\n".join(references)

    def answer_research_question(self, query: str) -> Dict[str, object]:
        """Answer a research question end-to-end.

        Validates and scope-checks the query, retrieves and filters studies,
        then synthesizes an answer. Always returns a dict with keys 'answer',
        'references', 'study_count', 'quality' and 'suggestions' (plus
        'quality_message' and 'query_analysis' on the success path); 'quality'
        is one of: invalid, out_of_scope, no_results, insufficient, high,
        moderate, low.
        """
        self.logger.info(f"Processing query: '{query}'")

        # Validate query length
        if len(query.strip()) < 3:
            return {
                'answer': "Query too short. Please provide a more detailed research question.",
                'references': "",
                'study_count': 0,
                'quality': "invalid",
                'suggestions': []
            }

        # Check domain relevance
        is_relevant, confidence, reason = self.query_analyzer.is_domain_relevant(query)

        if not is_relevant:
            # Offer example in-scope questions instead of an answer.
            suggestions = [
                "What sampling strategies work best in conflict-affected areas?",
                "How do researchers ensure data quality during active conflict?",
                "What are the ethical considerations for RCTs in fragile states?",
                "How do studies handle attrition bias in longitudinal FCAS research?",
                "What proxy measures are used when direct measurement is impossible?",
                "How do researchers adapt survey instruments for low-literacy populations?",
                "What methods are used to track mobile populations in conflict zones?",
                "How do studies establish counterfactuals in fragile settings?"
            ]

            return {
                'answer': f"This query appears to be outside the scope of development research in fragile and conflict-affected settings.\n\nReason: {reason}\n\nThis database contains research on development, health, education, governance, and policy interventions in FCAS contexts.",
                'references': "",
                'study_count': 0,
                'quality': "out_of_scope",
                'suggestions': suggestions
            }

        # Analyze query type
        query_analysis = self.query_analyzer.analyze_query_type(query)

        # Search for relevant chunks
        relevant_chunks = self.search_relevant_chunks(query)

        if not relevant_chunks:
            return {
                'answer': "No relevant studies found. This might be due to technical issues or very specific query terms.",
                'references': "",
                'study_count': 0,
                'quality': "no_results",
                'suggestions': ["Try broader search terms", "Check spelling", "Use more general concepts"]
            }

        # Group by studies
        studies_dict = self.group_by_studies(relevant_chunks)
        self.logger.info(f"Found {len(studies_dict)} unique studies")

        # Filter and rank studies
        top_studies, quality_message = self.filter_and_rank_studies(studies_dict, query)

        if len(top_studies) < self.config.min_studies:
            return {
                'answer': f"Found {len(studies_dict)} studies but only {len(top_studies)} met relevance criteria.\n\n{quality_message}\n\nTry using broader search terms or different keywords.",
                'references': "",
                'study_count': len(studies_dict),
                'quality': "insufficient",
                'suggestions': ["Use broader terms", "Try synonyms", "Focus on general concepts"]
            }

        # Create synthesis
        self.logger.info(f"Synthesizing findings from {len(top_studies)} studies")
        synthesis = self.create_synthesis(query, top_studies, query_analysis)
        references = self.format_references(top_studies)

        # Determine overall quality from the mean relevance of selected studies.
        avg_relevance = np.mean([s['max_relevance'] for s in top_studies])
        if avg_relevance >= self.config.min_relevance_strict:
            quality = "high"
        elif avg_relevance >= self.config.min_relevance_moderate:
            quality = "moderate"
        else:
            quality = "low"

        return {
            'answer': synthesis,
            'references': references,
            'study_count': len(top_studies),
            'quality': quality,
            'quality_message': quality_message,
            'query_analysis': query_analysis,
            'suggestions': []
        }
651
+
652
def main():
    """Smoke-test the synthesis pipeline against a handful of queries."""
    import os

    # Looser thresholds and a wider candidate pool than the defaults,
    # suitable for interactive experimentation.
    demo_config = SynthesisConfig(
        top_k=25,
        min_relevance_strict=0.65,
        min_relevance_moderate=0.55,
        min_relevance_threshold=0.50,
        max_studies=8,
        min_studies=3
    )

    api_key = os.environ.get("GOOGLE_API_KEY", "your_api_key_here")
    separator = "=" * 80

    try:
        synthesizer = ResearchSynthesizer(
            index_path="research_chunks.faiss",
            metadata_path="chunk_metadata.csv",
            api_key=api_key,
            config=demo_config,
            log_level=logging.INFO
        )

        sample_queries = [
            "what sampling strategies work best in conflict zones?",
            "how do researchers ensure data quality during active conflict?",
            "what are ethical considerations for randomized trials in fragile states?",
            "how do studies handle attrition bias in FCAS research?",
            "what proxy measures are used when direct measurement is impossible?",
            "how do researchers adapt survey instruments for low-literacy populations?",
            "who won the world cup in 2022?",  # expected to be rejected as out of scope
        ]

        for question in sample_queries:
            print(f"\n{separator}")
            print(f"QUERY: {question}")
            print(separator)

            outcome = synthesizer.answer_research_question(question)

            print(f"Quality: {outcome['quality']}")
            print(f"Studies: {outcome['study_count']}")
            print("\nAnswer:")
            print(outcome['answer'])

            if outcome['references']:
                print("\nReferences:")
                print(outcome['references'])

            if outcome['suggestions']:
                print("\nSuggestions:")
                for tip in outcome['suggestions']:
                    print(f"  β€’ {tip}")

    except Exception as e:
        logging.error(f"Failed to run main: {e}")
        raise


if __name__ == "__main__":
    main()
synthesis_qa_backend_old.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import pandas as pd
3
+ import numpy as np
4
+ import google.generativeai as genai
5
+ from typing import List, Dict
6
+ from collections import defaultdict
7
+
8
+
9
+ class ResearchSynthesizer:
10
+ def __init__(self, index_path: str, metadata_path: str, api_key: str):
11
+ """Initialize the research synthesis system"""
12
+ self.index = faiss.read_index(index_path)
13
+ self.metadata = pd.read_csv(metadata_path)
14
+ genai.configure(api_key=api_key)
15
+
16
+ print(f"πŸ” Loaded {self.index.ntotal} chunks from {len(self.metadata['record_id'].unique())} documents")
17
+ print(f"πŸ“Š FAISS index dimensions: {self.index.d}")
18
+
19
+ # Check for dimension mismatch
20
+ test_embedding = self._create_test_embedding()
21
+ if test_embedding is not None:
22
+ embedding_dim = test_embedding.shape[1]
23
+ index_dim = self.index.d
24
+ print(f"πŸ“Š Gemini embedding dimensions: {embedding_dim}")
25
+
26
+ if embedding_dim != index_dim:
27
+ print(f"⚠️ DIMENSION MISMATCH: Gemini={embedding_dim}, FAISS={index_dim}")
28
+ print("πŸ”§ Will apply dimension adjustment during search")
29
+ self.dimension_mismatch = True
30
+ self.target_dim = index_dim
31
+ self.source_dim = embedding_dim
32
+ else:
33
+ print("βœ… Dimensions match perfectly")
34
+ self.dimension_mismatch = False
35
+ else:
36
+ print("⚠️ Could not test embedding dimensions")
37
+ self.dimension_mismatch = False
38
+
39
+ def _create_test_embedding(self):
40
+ """Create a test embedding to check dimensions"""
41
+ try:
42
+ embed_result = genai.embed_content(
43
+ model="models/gemini-embedding-001", # Try the newer model first
44
+ content="test",
45
+ task_type="retrieval_query"
46
+ )
47
+ return np.array([embed_result['embedding']], dtype="float32")
48
+ except:
49
+ try:
50
+ # Fallback to older model
51
+ embed_result = genai.embed_content(
52
+ model="models/gemini-embedding-001",
53
+ content="test",
54
+ task_type="retrieval_query"
55
+ )
56
+ return np.array([embed_result['embedding']], dtype="float32")
57
+ except Exception as e:
58
+ print(f"❌ Could not create test embedding: {e}")
59
+ return None
60
+
61
+ def _adjust_embedding_dimensions(self, embedding: np.ndarray) -> np.ndarray:
62
+ """Adjust embedding dimensions to match FAISS index"""
63
+ if not self.dimension_mismatch:
64
+ return embedding
65
+
66
+ current_dim = embedding.shape[1]
67
+ target_dim = self.target_dim
68
+
69
+ print(f"πŸ”§ Adjusting dimensions: {current_dim} β†’ {target_dim}")
70
+
71
+ if current_dim < target_dim:
72
+ # Pad with zeros
73
+ padding = np.zeros((embedding.shape[0], target_dim - current_dim), dtype="float32")
74
+ adjusted = np.concatenate([embedding, padding], axis=1)
75
+ print(f"βœ… Padded embedding to {adjusted.shape[1]} dimensions")
76
+ elif current_dim > target_dim:
77
+ # Truncate
78
+ adjusted = embedding[:, :target_dim]
79
+ print(f"βœ… Truncated embedding to {adjusted.shape[1]} dimensions")
80
+ else:
81
+ adjusted = embedding
82
+
83
+ return adjusted
84
+
85
+ def search_relevant_chunks(self, query: str, top_k: int = 15) -> List[Dict]:
86
+ """Find relevant chunks using FAISS index and Gemini embeddings API"""
87
+ print(f"πŸ” Searching for: '{query}'")
88
+
89
+ # Try newer embedding model first, then fallback
90
+ embedding_models = [
91
+ "models/gemini-embedding-001"
92
+ ]
93
+
94
+ query_embedding = None
95
+ for model in embedding_models:
96
+ try:
97
+ print(f"🧠 Trying embedding model: {model}")
98
+ embed_result = genai.embed_content(
99
+ model=model,
100
+ content=query,
101
+ task_type="retrieval_query"
102
+ )
103
+ query_embedding = np.array([embed_result['embedding']], dtype="float32")
104
+ print(f"βœ… Embedding created with {model}: shape {query_embedding.shape}")
105
+ break
106
+ except Exception as e:
107
+ print(f"❌ Failed with {model}: {e}")
108
+ continue
109
+
110
+ if query_embedding is None:
111
+ print("❌ All embedding models failed")
112
+ return []
113
+
114
+ # Adjust dimensions if needed
115
+ query_embedding = self._adjust_embedding_dimensions(query_embedding)
116
+ print(f"πŸ”§ Final embedding shape: {query_embedding.shape}")
117
+
118
+ try:
119
+ distances, indices = self.index.search(query_embedding, top_k)
120
+ print(f"πŸ“Š Search completed - found {len(indices[0])} results")
121
+ print(f"πŸ“Š Distances range: {distances[0].min():.4f} to {distances[0].max():.4f}")
122
+ except Exception as e:
123
+ print(f"❌ FAISS search failed: {e}")
124
+ return []
125
+
126
+ results = []
127
+ for distance, idx in zip(distances[0], indices[0]):
128
+ if idx == -1:
129
+ continue
130
+ try:
131
+ chunk_data = self.metadata.iloc[idx].to_dict()
132
+ chunk_data['similarity_score'] = float(1 / (1 + distance))
133
+ chunk_data['faiss_distance'] = float(distance)
134
+ chunk_data['faiss_index'] = int(idx)
135
+ results.append(chunk_data)
136
+ except IndexError:
137
+ print(f"⚠️ Invalid index {idx}, skipping")
138
+ continue
139
+
140
+ print(f"βœ… Retrieved {len(results)} valid chunks")
141
+
142
+ # Sort by similarity score
143
+ results.sort(key=lambda x: x['similarity_score'], reverse=True)
144
+
145
+ if results:
146
+ best_score = results[0]['similarity_score']
147
+ worst_score = results[-1]['similarity_score']
148
+ print(f"πŸ“Š Similarity range: {worst_score:.4f} to {best_score:.4f}")
149
+
150
+ return results
151
+
152
+ def group_by_studies(self, chunks: List[Dict]) -> Dict[str, List[Dict]]:
153
+ """Group chunks by study/document"""
154
+ studies = defaultdict(list)
155
+ for chunk in chunks:
156
+ studies[chunk['record_id']].append(chunk)
157
+ return dict(studies)
158
+
159
def filter_and_rank_studies(self, studies: Dict[str, List[Dict]],
                            query: str = "",
                            min_relevance: float = 0.6,
                            max_studies: int = 8) -> List[Dict]:
    """Select and rank the most relevant studies using chunk scores plus metadata.

    Each study is represented by its best-scoring chunk; studies whose best
    chunk falls below ``min_relevance`` are dropped. Survivors are re-scored
    with metadata/quality boosts and the top ``max_studies`` are returned.

    Args:
        studies: Mapping of record_id -> list of chunk dicts (each with a
            'similarity_score' and a 'text' field).
        query: Original user query; used for keyword-overlap metadata boosts.
        min_relevance: Minimum best-chunk similarity for a study to survive.
        max_studies: Cap on the number of studies returned.

    Returns:
        Study-summary dicts sorted by the boosted score, highest first.
    """
    study_summaries = []

    print(f"πŸ” Filtering {len(studies)} studies with min_relevance={min_relevance}")

    for record_id, chunks in studies.items():
        # A study is judged by its single best chunk.
        best_chunk = max(chunks, key=lambda x: x['similarity_score'])

        print(f"πŸ“„ Study {record_id}: best score = {best_chunk['similarity_score']:.4f}")

        if best_chunk['similarity_score'] < min_relevance:
            print(f"❌ Skipping study {record_id} - below threshold")
            continue

        # Additional chunks are admitted with a more lenient (70%) threshold;
        # only the first three contribute to the combined synthesis text.
        relevant_chunks = [c for c in chunks if c['similarity_score'] > min_relevance * 0.7]
        combined_text = "\n\n".join([c['text'] for c in relevant_chunks[:3]])

        study_summary = {
            'record_id': record_id,
            'combined_text': combined_text,
            'max_relevance': best_chunk['similarity_score'],
            'chunk_count': len(relevant_chunks)
        }

        # Copy study-level metadata from the best chunk, excluding
        # chunk-internal/bookkeeping fields and similarity_* scores.
        excluded_fields = {'record_id', 'full_text', 'text', 'chunk_id', 'section',
                           'chunk_type', 'word_count', 'faiss_distance', 'faiss_index'}
        for key, value in best_chunk.items():
            if key not in excluded_fields and not key.startswith('similarity'):
                study_summary[key] = value

        study_summaries.append(study_summary)

    print(f"βœ… Found {len(study_summaries)} studies above threshold")

    # Boosted ranking: base similarity + metadata keyword overlap + quality
    # indicators. Boost magnitudes (0.1 / 0.05 / 0.03 / rigor*0.01) are
    # hand-tuned heuristics.
    def enhanced_score(study):
        base_score = study['max_relevance']
        query_lower = query.lower()

        # +0.1 per metadata field that shares any word with the query.
        metadata_boost = 0
        boost_fields = [
            'world_bank_sector', 'world_bank_subsector', 'study_countries',
            'population', 'data_collection_method', 'analysis_type',
            'research_design', 'topic_summary', 'countries_list'
        ]

        for field in boost_fields:
            if field in study and study[field]:
                field_value = str(study[field]).lower()
                if any(word in field_value for word in query_lower.split()):
                    metadata_boost += 0.1

        # Quality boosts: flags are stored as the *string* 'true' in the
        # metadata, hence the string comparison.
        quality_boost = 0
        if study.get('has_randomization') == 'true':
            quality_boost += 0.05
        if study.get('has_validation') == 'true':
            quality_boost += 0.03
        if study.get('has_advanced_analysis') == 'true':
            quality_boost += 0.03

        # rigor_score may be missing or non-numeric; ignore it then.
        try:
            rigor_numeric = float(study.get('rigor_score', 0))
            quality_boost += rigor_numeric * 0.01
        except (ValueError, TypeError):
            pass

        final_score = base_score + metadata_boost + quality_boost
        return final_score

    study_summaries.sort(key=enhanced_score, reverse=True)
    selected_studies = study_summaries[:max_studies]

    print(f"🎯 Selected top {len(selected_studies)} studies for synthesis")
    for i, study in enumerate(selected_studies, 1):
        # NOTE(review): enhanced_score is recomputed here purely for display.
        print(f"   {i}. Score: {enhanced_score(study):.4f} - {study.get('title', 'No title')[:60]}...")

    return selected_studies
244
+
245
def create_synthesis(self, query: str, studies: List[Dict]) -> str:
    """Build a rich per-study context block and ask Gemini for a synthesis.

    Args:
        query: The user's research question.
        studies: Study summaries from filter_and_rank_studies; each must have
            'combined_text', and may carry title/author/metadata fields.

    Returns:
        The model's synthesized answer text, or an error string on failure.
    """
    query_analysis = self.analyze_query(query)

    # Build rich context from studies: header line, key metadata,
    # methodology, quality indicators, then truncated content.
    studies_context = ""
    for i, study in enumerate(studies, 1):
        title = study.get('title', 'Unknown Title')
        authors = study.get('authors', 'Unknown Authors')
        year = study.get('publication_year', study.get('research_year', 'Unknown'))
        countries = study.get('study_countries', study.get('countries_list', 'Unknown'))

        studies_context += f"\n[Study {i}] {title}\n"
        studies_context += f"Authors: {authors} ({year})\n"
        studies_context += f"Countries: {countries}"

        # Key metadata, skipping empty/unknown placeholder values.
        for field, label in [
            ('world_bank_sector', 'Sector'),
            ('world_bank_subsector', 'Subsector'),
            ('population', 'Population'),
            ('sample_size', 'Sample Size'),
            ('research_design', 'Design')
        ]:
            if field in study and study[field] and str(study[field]).lower() not in ['unknown', 'nan', '']:
                studies_context += f" | {label}: {study[field]}"

        studies_context += "\n"

        # Methodology information, same placeholder filtering.
        method_info = []
        for field, label in [
            ('data_collection_method', 'Data Collection'),
            ('analysis_type', 'Analysis'),
            ('primary_data_techniques', 'Primary Methods'),
            ('data_analysis_methods', 'Analysis Methods'),
            ('research_period', 'Period')
        ]:
            if field in study and study[field] and str(study[field]).lower() not in ['unknown', 'nan', '']:
                method_info.append(f"{label}: {study[field]}")

        if method_info:
            studies_context += f"Methodology: {' | '.join(method_info)}\n"

        # Quality indicators: boolean has_* flags (stored as string 'true')
        # contribute just their label; other fields get "label: value".
        quality_info = []
        for field, label in [
            ('rigor_score', 'Rigor Score'),
            ('methodological_sophistication', 'Sophistication'),
            ('has_validation', 'Validation'),
            ('has_randomization', 'RCT'),
            ('has_mixed_methods', 'Mixed Methods')
        ]:
            if field in study and study[field] and str(study[field]).lower() not in ['unknown', 'nan', 'false', '']:
                if field.startswith('has_') and str(study[field]).lower() == 'true':
                    quality_info.append(label)
                elif not field.startswith('has_'):
                    quality_info.append(f"{label}: {study[field]}")

        if quality_info:
            studies_context += f"Quality: {' | '.join(quality_info)}\n"

        # Content truncated to 1000 chars to keep the prompt bounded.
        studies_context += f"Content: {study['combined_text'][:1000]}...\n"
        studies_context += "-" * 80 + "\n"

    synthesis_prompt = f"""You are an expert research synthesizer analyzing studies from fragile and conflict-affected settings (FCAS).

USER QUERY: "{query}"

FOCUS AREA: {query_analysis['focus_area']}

STUDIES TO SYNTHESIZE:
{studies_context}

SYNTHESIS INSTRUCTIONS:
1. **Direct Answer**: Start with a clear, direct answer to the user's question
2. **Evidence-Based**: Ground all claims in the provided studies with specific citations
3. **Methodology Focus**: When relevant, detail research methods, sample sizes, and study designs
4. **Geographic Context**: Highlight geographic patterns and country-specific findings
5. **Quality Assessment**: Note study quality indicators (RCTs, sample sizes, rigor scores)
6. **Practical Insights**: Extract actionable findings and recommendations
7. **Knowledge Gaps**: Identify areas where more research is needed

FORMAT:
- Use clear section headers
- Include specific study citations: (Author, Year)
- Highlight key statistics and findings
- Note methodological strengths and limitations
- Provide geographic context where relevant

Write a comprehensive synthesis that directly addresses the user's question while showcasing the depth and breadth of evidence from these {len(studies)} studies."""

    # Network call to Gemini; any failure is reported in-band as a string
    # rather than raised, so callers always get text back.
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(synthesis_prompt)
        return response.text
    except Exception as e:
        return f"Error creating synthesis: {e}"
344
+
345
def analyze_query(self, query: str) -> Dict[str, str]:
    """Classify the query into a coarse focus area via keyword matching.

    The first matching keyword group (in priority order) wins; queries
    matching nothing fall back to "general findings".
    """
    lowered = query.lower()

    # Priority-ordered keyword groups and their focus-area labels.
    keyword_groups = (
        (('method', 'approach', 'methodology', 'technique'), "methodological approaches"),
        (('result', 'finding', 'outcome', 'impact', 'effect'), "key findings and outcomes"),
        (('challenge', 'barrier', 'problem', 'issue'), "challenges and barriers"),
        (('recommendation', 'solution', 'strategy', 'intervention'), "strategies and recommendations"),
    )

    focus_area = "general findings"
    for keywords, label in keyword_groups:
        if any(word in lowered for word in keywords):
            focus_area = label
            break

    return {
        'focus_area': focus_area,
        'original_query': query
    }
363
+
364
def format_references(self, studies: List[Dict]) -> str:
    """Render numbered, academic-style reference entries.

    Each entry has the form
    ``[n] Authors (Year). Title *Countries: ...* *Relevance: 0.XXX*``
    with the country and relevance tails included only when present.
    Entries are joined with blank lines.
    """
    entries = []

    for num, study in enumerate(studies, 1):
        authors = study.get('authors', 'Unknown Authors')
        year = study.get('publication_year', study.get('research_year', 'Unknown'))
        title = study.get('title', 'Unknown Title')

        pieces = [f"[{num}] {authors} ({year}). {title}"]

        countries = study.get('study_countries', '')
        if countries:
            pieces.append(f" *Countries: {countries}*")

        if study.get('max_relevance'):
            pieces.append(f" *Relevance: {study['max_relevance']:.3f}*")

        entries.append("".join(pieces))

    return "\n\n".join(entries)
386
+
387
def answer_research_question(self, query: str,
                             min_studies: int = 3,
                             max_studies: int = 8) -> Dict[str, str]:
    """End-to-end pipeline: retrieve chunks, rank studies, synthesize an answer.

    Args:
        query: The user's research question.
        min_studies: If fewer studies survive the first filtering pass, the
            relevance threshold is relaxed (0.5 -> 0.3) and filtering retried.
        max_studies: Cap on studies passed to the synthesizer.

    Returns:
        Dict with 'answer' (str), 'references' (str) and 'study_count'.
        NOTE(review): 'study_count' is an int, so the Dict[str, str]
        annotation is loose.
    """
    print(f"πŸ” Processing query: '{query}'")

    # Retrieve a generous pool of chunks; studies are deduplicated below.
    relevant_chunks = self.search_relevant_chunks(query, top_k=25)

    if not relevant_chunks:
        return {
            'answer': "No relevant studies found. This might be due to dimension mismatch or API issues.",
            'references': "",
            'study_count': 0
        }

    # Group chunks by their source study.
    studies_dict = self.group_by_studies(relevant_chunks)
    print(f"πŸ“š Found {len(studies_dict)} unique studies")

    # First pass with a moderately lenient threshold.
    top_studies = self.filter_and_rank_studies(
        studies_dict,
        query=query,
        max_studies=max_studies,
        min_relevance=0.5
    )

    if len(top_studies) < min_studies:
        # Not enough studies survived: retry with an even lower threshold.
        top_studies = self.filter_and_rank_studies(
            studies_dict,
            query=query,
            max_studies=max_studies,
            min_relevance=0.3
        )

    if len(top_studies) == 0:
        return {
            'answer': f"Found {len(studies_dict)} studies but none met relevance criteria. Try broader search terms.",
            'references': "",
            'study_count': len(studies_dict)
        }

    print(f"πŸ“š Synthesizing findings from {len(top_studies)} studies")

    # LLM synthesis plus formatted reference list.
    synthesis = self.create_synthesis(query, top_studies)
    references = self.format_references(top_studies)

    return {
        'answer': synthesis,
        'references': references,
        'study_count': len(top_studies)
    }
442
+
443
+
444
def main():
    """Smoke-test the synthesizer against a few canned queries.

    Reads GOOGLE_API_KEY from the environment (falls back to a placeholder,
    which will make API calls fail) and expects the FAISS index and metadata
    CSV to exist in the working directory.
    """
    import os
    api_key = os.environ.get("GOOGLE_API_KEY", "your_api_key_here")

    synthesizer = ResearchSynthesizer(
        index_path="research_chunks.faiss",
        metadata_path="chunk_metadata.csv",
        api_key=api_key
    )

    test_queries = [
        "agricultural research methods",
        "cash transfer programs",
        "education in fragile states",
        "health interventions"
    ]

    # Run each query end-to-end and print answer + references.
    for query in test_queries:
        print("\n" + "="*80)
        print(f"QUERY: {query}")
        print("="*80)

        result = synthesizer.answer_research_question(query)
        print(f"Studies found: {result['study_count']}")
        print(result['answer'])
        print(result['references'])
        print(f"\nπŸ“Š Synthesized from {result['study_count']} studies")

if __name__ == "__main__":
    main()
475
+
visualisations.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import plotly.express as px
3
+ import plotly.graph_objects as go
4
+ from plotly.subplots import make_subplots
5
+ from collections import Counter
6
+ import numpy as np
7
+
8
def create_world_map(docs_df):
    """Create an interactive choropleth of study coverage for a fixed list of
    conflict-affected countries.

    Args:
        docs_df: Study-level DataFrame with a 'study_countries' column of
            comma- or semicolon-separated country names.

    Returns:
        The plotly Figure (also displayed via fig.show()), or None when the
        DataFrame is empty or lacks 'study_countries'.
    """
    if docs_df.empty or 'study_countries' not in docs_df.columns:
        print("No data or missing 'study_countries' column")
        return None

    # Hard-coded country list with target study counts.
    # NOTE(review): these counts are not derived from docs_df; presumably
    # they come from an external scoping exercise - confirm their source.
    target_countries = {
        # Nationwide conflict
        'Burkina Faso': 1098,
        'Afghanistan': 697,
        'Mali': 496,
        'Sudan': 470,
        'Haiti': 394,
        'Somalia': 373,
        'Niger': 352,
        'Syria': 323,
        'South Sudan': 294,
        'Libya': 119,
        'Palestinian Territories': 81,
        'Central African Republic': 72,
        # Partial conflict
        'Iraq': 128,
        'Nigeria': 121,
        'Lebanon': 102,
        'Ethiopia': 81,
        'Democratic Republic of the Congo': 71,
        'Cameroon': 54,
        'Chad': 36,
        'Mozambique': 30,
        'Myanmar': 11
    }

    # Count how many dataset rows actually mention each target country.
    country_counts = Counter()

    for countries_str in docs_df['study_countries'].dropna():
        if pd.isna(countries_str) or str(countries_str).lower() in ['nan', 'none', '']:
            continue

        # Normalize ';' separators to ',' before splitting.
        countries = [c.strip() for c in str(countries_str).replace(';', ',').split(',')]
        for country in countries:
            if country in target_countries:
                country_counts[country] += 1

    # One row per target country, carrying both target and actual counts.
    # NOTE(review): the >400 cutoff for "Nationwide" disagrees with the
    # comment grouping in target_countries above (e.g. Haiti at 394 is
    # labelled Partial here) - confirm intended classification.
    map_data = []
    for country, target_count in target_countries.items():
        actual_count = country_counts.get(country, 0)
        conflict_type = "Nationwide" if target_count > 400 else "Partial"
        map_data.append({
            'country': country,
            'actual_studies': actual_count,
            'target_studies': target_count,
            'conflict_type': conflict_type
        })

    map_df = pd.DataFrame(map_data)

    print(f"Mapping {len(map_df)} conflict-affected countries")
    print(f"Countries with data: {map_df[map_df['actual_studies'] > 0]['country'].tolist()}")

    # Choropleth colored by the *target* counts; the hover shows both
    # target and in-dataset counts.
    fig = go.Figure(data=go.Choropleth(
        locations=map_df['country'],
        z=map_df['target_studies'],
        locationmode='country names',
        colorscale='Reds',
        hovertemplate='<b>%{location}</b><br>' +
                      'Studies (Target): %{z}<br>' +
                      'Studies (In Dataset): %{customdata}<br>' +
                      '<extra></extra>',
        customdata=map_df['actual_studies'],
        colorbar_title="Number of Studies"
    ))

    fig.update_layout(
        title={
            'text': 'Research Coverage: Conflict-Affected Countries',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='natural earth'
        ),
        height=600,
        width=1000
    )

    fig.show()
    return fig
102
+
103
def create_interactive_data_explorer(docs_df):
    """Summarize dataset completeness and plot a stacked valid/missing bar chart.

    Args:
        docs_df: Study-level DataFrame.

    Returns:
        Tuple (fig, summary_df), or None for an empty DataFrame.
        NOTE(review): the None-vs-tuple return shape is inconsistent;
        callers must guard before unpacking.
    """
    if docs_df.empty:
        print("No data available")
        return None

    print("=== DATASET OVERVIEW ===")
    print(f"Total studies: {len(docs_df)}")
    print(f"Columns available: {len(docs_df.columns)}")

    # Candidate columns for the summary; filtered to those present below.
    numeric_cols = ['publication_year', 'sample_numeric', 'rigor_score', 'sdg_number']
    categorical_cols = [
        'world_bank_sector', 'research_design', 'data_collection_method',
        'analysis_type', 'study_countries', 'population', 'author_income_group',
        'has_validation', 'has_randomization', 'has_mixed_methods', 'has_advanced_analysis'
    ]

    # Filter to existing columns
    available_numeric = [col for col in numeric_cols if col in docs_df.columns]
    available_categorical = [col for col in categorical_cols if col in docs_df.columns]

    print(f"Numeric variables: {available_numeric}")
    print(f"Categorical variables: {available_categorical}")

    # One summary row per variable: type, valid/missing counts, short stats.
    summary_data = []

    # Numeric summaries (coerce non-numeric to NaN first).
    for col in available_numeric:
        values = pd.to_numeric(docs_df[col], errors='coerce').dropna()
        if len(values) > 0:
            summary_data.append({
                'Variable': col,
                'Type': 'Numeric',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"Mean: {values.mean():.1f}, Range: {values.min()}-{values.max()}"
            })

    # Categorical summaries: cardinality and most frequent category.
    for col in available_categorical:
        values = docs_df[col].dropna()
        if len(values) > 0:
            unique_count = values.nunique()
            top_category = values.value_counts().index[0] if len(values) > 0 else "None"
            summary_data.append({
                'Variable': col,
                'Type': 'Categorical',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"{unique_count} categories, Top: {top_category}"
            })

    summary_df = pd.DataFrame(summary_data)

    # Stacked bar chart: valid vs missing counts per variable.
    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Valid_Values'],
        name='Valid Values',
        marker_color='steelblue',
        hovertemplate='<b>%{x}</b><br>Valid: %{y}<br>%{customdata}<extra></extra>',
        customdata=summary_df['Summary']
    ))

    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Missing'],
        name='Missing Values',
        marker_color='lightcoral'
    ))

    fig.update_layout(
        title='Data Completeness by Variable',
        xaxis_title='Variables',
        yaxis_title='Number of Records',
        barmode='stack',
        height=500,
        xaxis={'tickangle': 45}
    )

    fig.show()

    # Plain-text echo of the summary table.
    print("\n=== VARIABLE SUMMARY ===")
    for _, row in summary_df.iterrows():
        print(f"{row['Variable']} ({row['Type']}): {row['Valid_Values']}/{row['Valid_Values'] + row['Missing']} values - {row['Summary']}")

    return fig, summary_df
196
+
197
def create_pivot_analysis(docs_df, row_var, col_var, value_var=None, agg_func='count'):
    """Create a pivot table of two variables and display it as a heatmap.

    Args:
        docs_df: Study-level DataFrame.
        row_var: Column used for pivot rows.
        col_var: Column used for pivot columns.
        value_var: Optional numeric column to aggregate; when omitted a
            simple cross-tabulated count is used.
        agg_func: Aggregation applied to value_var (pandas aggfunc name).

    Returns:
        Tuple (fig, pivot_df), or None on empty data, missing variables,
        or pivot failure.
    """
    if docs_df.empty:
        return None

    if row_var not in docs_df.columns or col_var not in docs_df.columns:
        print(f"Variables not found. Available: {list(docs_df.columns)}")
        return None

    try:
        if value_var and value_var in docs_df.columns:
            # Numeric aggregation of value_var over the row/col grid.
            pivot_df = docs_df.pivot_table(
                index=row_var,
                columns=col_var,
                values=value_var,
                aggfunc=agg_func,
                fill_value=0
            )
            title = f"{agg_func.title()} of {value_var} by {row_var} and {col_var}"
        else:
            # Plain frequency cross-tabulation.
            pivot_df = pd.crosstab(docs_df[row_var], docs_df[col_var])
            title = f"Study Count by {row_var} and {col_var}"

        # Heatmap sized proportionally to the pivot's dimensions.
        fig = px.imshow(
            pivot_df.values,
            x=pivot_df.columns,
            y=pivot_df.index,
            color_continuous_scale='Viridis',
            title=title
        )

        fig.update_layout(
            height=max(400, len(pivot_df.index) * 30),
            width=max(600, len(pivot_df.columns) * 50)
        )

        fig.show()

        print(f"\nPivot Table: {row_var} Γ— {col_var}")
        print(pivot_df.head(10))

        return fig, pivot_df

    except Exception as e:
        # Pivoting can fail e.g. on non-aggregatable dtypes; report and bail.
        print(f"Error creating pivot: {e}")
        return None
246
+
247
+ # Example usage functions
248
def explore_methodology_patterns(docs_df):
    """Cross-tabulate World Bank sector against research design.

    Returns the (fig, pivot_df) pair from create_pivot_analysis, or None
    when the DataFrame is empty or a required column is absent.
    """
    if docs_df.empty:
        return None

    required = ('research_design', 'world_bank_sector')
    if all(column in docs_df.columns for column in required):
        print("=== RESEARCH DESIGN BY SECTOR ===")
        return create_pivot_analysis(docs_df, 'world_bank_sector', 'research_design')
257
+
258
def explore_data_collection(docs_df):
    """Cross-tabulate author income group against data-collection method.

    Returns the (fig, pivot_df) pair from create_pivot_analysis, or None
    when the DataFrame is empty or a required column is absent.
    """
    if docs_df.empty:
        return None

    required = ('data_collection_method', 'author_income_group')
    if all(column in docs_df.columns for column in required):
        print("=== DATA COLLECTION BY AUTHOR INCOME GROUP ===")
        return create_pivot_analysis(docs_df, 'author_income_group', 'data_collection_method')
267
+
268
def filter_and_analyze(docs_df, **filters):
    """Filter the dataset by keyword criteria and print a quick profile.

    Args:
        docs_df: Study-level DataFrame.
        **filters: Supported keys: countries (str or list, substring match),
            sectors (str or list, exact match), min_year, max_year,
            has_rct (truthy -> keep randomized studies only),
            min_sample_size (compared against 'sample_numeric').

    Returns:
        The filtered DataFrame, or None when input is empty or nothing matches.
    """
    if docs_df.empty:
        print("No data available")
        return None

    filtered = docs_df.copy()
    filter_summary = []

    # Apply filters one by one, narrowing `filtered` and recording a
    # human-readable description of each applied criterion.
    if 'countries' in filters and filters['countries']:
        countries = filters['countries'] if isinstance(filters['countries'], list) else [filters['countries']]
        # Substring match against the comma-separated country field.
        country_mask = filtered['study_countries'].str.contains('|'.join(countries), case=False, na=False)
        filtered = filtered[country_mask]
        filter_summary.append(f"Countries: {', '.join(countries)}")

    if 'sectors' in filters and filters['sectors']:
        sectors = filters['sectors'] if isinstance(filters['sectors'], list) else [filters['sectors']]
        sector_mask = filtered['world_bank_sector'].isin(sectors)
        filtered = filtered[sector_mask]
        filter_summary.append(f"Sectors: {', '.join(sectors)}")

    if 'min_year' in filters and filters['min_year']:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col >= filters['min_year']]
        filter_summary.append(f"Year >= {filters['min_year']}")

    if 'max_year' in filters and filters['max_year']:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col <= filters['max_year']]
        filter_summary.append(f"Year <= {filters['max_year']}")

    if 'has_rct' in filters and filters['has_rct']:
        # Randomization flags are stored as strings; accept common truthy forms.
        filtered = filtered[filtered['has_randomization'].str.lower().isin(['true', 'yes', '1'])]
        filter_summary.append("RCT studies only")

    if 'min_sample_size' in filters and filters['min_sample_size']:
        sample_col = pd.to_numeric(filtered['sample_numeric'], errors='coerce')
        filtered = filtered[sample_col >= filters['min_sample_size']]
        filter_summary.append(f"Sample size >= {filters['min_sample_size']}")

    # Report the applied filters and hit rate.
    print(f"=== FILTERED ANALYSIS ===")
    print(f"Filters applied: {'; '.join(filter_summary) if filter_summary else 'None'}")
    print(f"Studies found: {len(filtered)}/{len(docs_df)}")

    if filtered.empty:
        print("No studies match the criteria.")
        return None

    # Quick profile of the subset; only shown for more than 5 matches.
    if len(filtered) > 5:
        # Show key distributions
        if 'world_bank_sector' in filtered.columns:
            print(f"\nTop sectors: {dict(filtered['world_bank_sector'].value_counts().head(3))}")
        if 'research_design' in filtered.columns:
            print(f"Research designs: {dict(filtered['research_design'].value_counts().head(3))}")
        if 'rigor_score' in filtered.columns:
            rigor_scores = pd.to_numeric(filtered['rigor_score'], errors='coerce').dropna()
            if len(rigor_scores) > 0:
                print(f"Rigor score: mean={rigor_scores.mean():.1f}, range={rigor_scores.min()}-{rigor_scores.max()}")

    return filtered
331
+
332
+ # Quick start function
333
def quick_analysis(docs_df):
    """Run a quick end-to-end analysis of the dataset.

    Args:
        docs_df: Study-level DataFrame.

    Returns:
        Tuple (explorer_fig, map_fig, summary_df); elements are None when
        the corresponding step had no data to work with.
    """
    print("Starting comprehensive data analysis...")

    # 1. Data overview. create_interactive_data_explorer returns None (not a
    # tuple) for an empty DataFrame, so guard before unpacking - the previous
    # unconditional unpack raised TypeError on empty input.
    explorer_result = create_interactive_data_explorer(docs_df)
    if explorer_result is None:
        explorer_fig, summary_df = None, None
    else:
        explorer_fig, summary_df = explorer_result

    # 2. Geographic coverage map (None when data/column is missing).
    map_fig = create_world_map(docs_df)

    # 3. Sample pivot analyses (side effects only; figures shown inline).
    if len(docs_df) > 0:
        explore_methodology_patterns(docs_df)
        explore_data_collection(docs_df)

    return explorer_fig, map_fig, summary_df
visualisations_old.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import plotly.express as px
3
+ import plotly.graph_objects as go
4
+ from plotly.subplots import make_subplots
5
+ from collections import Counter
6
+
7
def create_world_map(docs_df):
    """Create an interactive choropleth of study counts per country.

    Unlike the newer variant, this maps every country found in
    'study_countries' (no hard-coded target list).

    Args:
        docs_df: Study-level DataFrame with a 'study_countries' column of
            comma-separated country names.

    Returns:
        The plotly Figure, or None when the DataFrame is empty or no
        countries could be extracted.
    """
    if docs_df.empty:
        return None

    # Count studies by country (a row can contribute to several countries).
    country_counts = Counter()
    for countries_str in docs_df['study_countries'].dropna():
        if isinstance(countries_str, str) and countries_str.lower() != 'nan':
            # Split multiple countries
            countries = [c.strip() for c in countries_str.split(',')]
            for country in countries:
                country_counts[country] += 1

    if not country_counts:
        return None

    # Choropleth keyed by country name.
    countries = list(country_counts.keys())
    counts = list(country_counts.values())

    fig = go.Figure(data=go.Choropleth(
        locations=countries,
        z=counts,
        locationmode='country names',
        colorscale='Viridis',
        text=countries,
        hovertemplate='<b>%{text}</b><br>Studies: %{z}<extra></extra>',
        colorbar_title="Number of Studies"
    ))

    fig.update_layout(
        title={
            'text': '🌍 Global Research Coverage',
            'x': 0.5,
            'font': {'size': 20}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='equirectangular'
        ),
        height=500
    )

    return fig
53
+
54
def create_sector_analysis(docs_df):
    """Build two figures: top-10 sector bar chart and research-design pie.

    Args:
        docs_df: Study-level DataFrame with 'world_bank_sector' and
            'research_design' columns.

    Returns:
        Tuple (bar_fig, pie_fig), or (None, None) for an empty DataFrame.
    """
    if docs_df.empty:
        return None, None

    # Horizontal bar chart of the ten most common sectors.
    sector_counts = docs_df['world_bank_sector'].value_counts().head(10)

    fig1 = px.bar(
        x=sector_counts.values,
        y=sector_counts.index,
        orientation='h',
        title="πŸ“Š Research by World Bank Sector",
        labels={'x': 'Number of Studies', 'y': 'Sector'},
        color=sector_counts.values,
        color_continuous_scale='viridis'
    )
    fig1.update_layout(height=400, showlegend=False)

    # Pie chart of the eight most common research designs.
    design_counts = docs_df['research_design'].value_counts().head(8)

    fig2 = px.pie(
        values=design_counts.values,
        names=design_counts.index,
        title="πŸ”¬ Research Design Distribution",
        color_discrete_sequence=px.colors.qualitative.Set3
    )
    fig2.update_traces(textposition='inside', textinfo='percent+label')
    fig2.update_layout(height=400)

    return fig1, fig2
86
+
87
def create_methodology_dashboard(docs_df):
    """Build a 2x2 dashboard: sample sizes, rigor scores, data-collection
    methods, and quality-indicator counts.

    Args:
        docs_df: Study-level DataFrame. Reads 'sample_size', 'rigor_score',
            'data_collection_method' and the has_* flag columns.
            NOTE(review): this older variant uses 'sample_size', while the
            newer explorer uses 'sample_numeric' - confirm which column the
            current data carries.

    Returns:
        The combined plotly Figure, or None for an empty DataFrame.
    """
    if docs_df.empty:
        return None

    # 2x2 grid; each quadrant is populated only if its column has data.
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Sample Size Distribution', 'Rigor Scores',
                        'Data Collection Methods', 'Quality Indicators'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    # Top-left: histogram of numeric sample sizes.
    sample_sizes = pd.to_numeric(docs_df['sample_size'], errors='coerce').dropna()
    if not sample_sizes.empty:
        fig.add_trace(
            go.Histogram(x=sample_sizes, name="Sample Size", nbinsx=20),
            row=1, col=1
        )

    # Top-right: histogram of rigor scores.
    rigor_scores = pd.to_numeric(docs_df['rigor_score'], errors='coerce').dropna()
    if not rigor_scores.empty:
        fig.add_trace(
            go.Histogram(x=rigor_scores, name="Rigor Score", nbinsx=10),
            row=1, col=2
        )

    # Bottom-left: top-8 data collection methods as a horizontal bar chart.
    data_methods = docs_df['data_collection_method'].value_counts().head(8)
    if not data_methods.empty:
        fig.add_trace(
            go.Bar(x=data_methods.values, y=data_methods.index,
                   orientation='h', name="Data Methods"),
            row=2, col=1
        )

    # Bottom-right: counts of string-'true' quality flags (RCT, validation,
    # mixed methods).
    quality_data = []
    for col in ['has_randomization', 'has_validation', 'has_mixed_methods']:
        if col in docs_df.columns:
            true_count = (docs_df[col] == 'true').sum()
            quality_data.append((col.replace('has_', '').title(), true_count))

    if quality_data:
        labels, values = zip(*quality_data)
        fig.add_trace(
            go.Bar(x=list(labels), y=list(values), name="Quality Features"),
            row=2, col=2
        )

    fig.update_layout(
        height=800,
        title_text="πŸ“ˆ Methodology Dashboard",
        title_x=0.5,
        showlegend=False
    )

    return fig
148
+
149
def filter_studies(docs_df, countries, sectors, min_year, max_year, has_rct, min_sample_size):
    """Filter studies by the given criteria and return a markdown summary.

    Args:
        docs_df: Study-level DataFrame.
        countries: Country substrings to match (list; falsy to skip).
        sectors: Exact sector names to keep (list; falsy to skip).
        min_year / max_year: Inclusive publication-year bounds (falsy to skip).
        has_rct: Truthy to keep only rows with has_randomization == 'true'.
        min_sample_size: Minimum numeric 'sample_size' (falsy to skip).

    Returns:
        A markdown-formatted results string (up to 10 rows shown), or a
        plain message when there is no data / no match.
        NOTE(review): to_markdown() requires the optional 'tabulate'
        dependency at runtime.
    """
    if docs_df.empty:
        return "No data available"

    filtered = docs_df.copy()

    # Apply each filter only when its argument is provided/truthy.
    if countries:
        country_mask = filtered['study_countries'].str.contains('|'.join(countries), case=False, na=False)
        filtered = filtered[country_mask]

    if sectors:
        sector_mask = filtered['world_bank_sector'].isin(sectors)
        filtered = filtered[sector_mask]

    if min_year:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col >= min_year]

    if max_year:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col <= max_year]

    if has_rct:
        # Flags are stored as the string 'true' (cf. the dashboard above).
        filtered = filtered[filtered['has_randomization'] == 'true']

    if min_sample_size:
        sample_col = pd.to_numeric(filtered['sample_size'], errors='coerce')
        filtered = filtered[sample_col >= min_sample_size]

    # Render the result set.
    if filtered.empty:
        return "No studies match the selected criteria."

    # Header with the match count.
    result = f"## πŸ” Filtered Results: {len(filtered)} studies\n\n"

    # Show the first 10 rows over whichever display columns exist.
    display_cols = ['title', 'authors', 'publication_year', 'study_countries',
                    'world_bank_sector', 'research_design', 'sample_size']
    available_cols = [col for col in display_cols if col in filtered.columns]

    sample_df = filtered[available_cols].head(10)
    result += sample_df.to_markdown(index=False)

    if len(filtered) > 10:
        result += f"\n\n*... and {len(filtered) - 10} more studies*"

    return result
199
+
200
def get_unique_values(docs_df):
    """Extract sorted unique country and sector values for UI dropdowns.

    Args:
        docs_df: Study-level DataFrame; 'study_countries' may hold
            comma-separated country lists, 'world_bank_sector' single values.

    Returns:
        Tuple (countries_list, sectors_list) of sorted unique strings.
        Both lists are empty when the DataFrame is empty, and each list is
        empty when its column is absent (the previous version raised
        KeyError on a non-empty frame missing either column).
    """
    countries_list = []
    sectors_list = []

    if not docs_df.empty:
        # Countries: split comma-separated entries, skipping NaN/'nan' noise.
        if 'study_countries' in docs_df.columns:
            for countries_str in docs_df['study_countries'].dropna():
                if isinstance(countries_str, str) and countries_str.lower() != 'nan':
                    countries_list.extend(c.strip() for c in countries_str.split(','))
            countries_list = sorted(set(countries_list))

        # Sectors are single-valued; unique + sort is enough.
        if 'world_bank_sector' in docs_df.columns:
            sectors_list = sorted(docs_df['world_bank_sector'].dropna().unique().tolist())

    return countries_list, sectors_list