Spaces:
Running
Running
added files
Browse files- Dockerfile +21 -0
- HUGGINGFACE_DEPLOYMENT.md +288 -0
- README.md +71 -6
- app.py +335 -0
- config.py +78 -0
- data/Biomedical-pubmedqa.csv +7 -0
- data/Finance-finqa.csv +7 -0
- data/General-msmarco.csv +7 -0
- data/Legal-cuad.csv +6 -0
- data_loader.py +160 -0
- requirements.txt +6 -0
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use the official Python 3.9 image
FROM python:3.9

# Set the working directory to /code
WORKDIR /code

# Copy the requirements file into the container at /code
# (copied before the rest of the source so Docker caches the dependency
# layer and skips reinstalling when only application code changes)
COPY ./requirements.txt /code/requirements.txt

# Install the dependencies
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the rest of your application code
COPY . .

# Create a writable directory for cache/temporary files if needed (good practice)
# NOTE(review): world-writable (777) keeps the cache usable if the platform
# runs the container as a non-root user — confirm this matches deployment needs.
RUN mkdir -p /code/cache && chmod 777 /code/cache

# Command to run the application
# We use host 0.0.0.0 and port 7860 (Hugging Face's default port)
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
HUGGINGFACE_DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 Deploying RAG Analytics to Hugging Face Spaces
|
| 2 |
+
|
| 3 |
+
## 📦 Required Files (Keep These)
|
| 4 |
+
|
| 5 |
+
### Core Application Files:
|
| 6 |
+
- ✅ `app.py` - Main application
|
| 7 |
+
- ✅ `config.py` - Configuration settings
|
| 8 |
+
- ✅ `data_loader.py` - Data loading logic
|
| 9 |
+
- ✅ `requirements.txt` - Python dependencies
|
| 10 |
+
- ✅ `data/` folder with CSV files:
|
| 11 |
+
- `Biomedical-pubmedqa.csv`
|
| 12 |
+
- `Finance-finqa.csv`
|
| 13 |
+
- `General-msmarco.csv`
|
| 14 |
+
- `Legal-cuad.csv`
|
| 15 |
+
|
| 16 |
+
### Optional Files:
|
| 17 |
+
- ✅ `README.md` - Documentation (recommended)
|
| 18 |
+
- ✅ `.gitattributes` - Git settings (auto-generated)
|
| 19 |
+
|
| 20 |
+
## 🗑️ Files to DELETE (Not Needed for Deployment)
|
| 21 |
+
|
| 22 |
+
These are just test/debug files:
|
| 23 |
+
- ❌ `debug_plot.py`
|
| 24 |
+
- ❌ `test_data.py`
|
| 25 |
+
- ❌ `test_fix.py`
|
| 26 |
+
- ❌ `show_expected_data.py`
|
| 27 |
+
- ❌ `RESTART_INSTRUCTIONS.md`
|
| 28 |
+
- ❌ `Dockerfile` (unless you specifically want Docker support)
|
| 29 |
+
- ❌ `__pycache__/` (auto-generated, will be ignored)
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## 🎯 Deployment Steps
|
| 34 |
+
|
| 35 |
+
### Step 1: Create Hugging Face Space
|
| 36 |
+
|
| 37 |
+
1. Go to https://huggingface.co/spaces
|
| 38 |
+
2. Click **"Create new Space"**
|
| 39 |
+
3. Configure:
|
| 40 |
+
- **Owner:** Your username/organization
|
| 41 |
+
- **Space name:** `rag-analytics-dashboard` (or your choice)
|
| 42 |
+
- **License:** Apache 2.0 (recommended)
|
| 43 |
+
- **Select SDK:** **Gradio**
|
| 44 |
+
- **Space hardware:** CPU basic (free tier is fine)
|
| 45 |
+
- **Visibility:** Public or Private (your choice)
|
| 46 |
+
4. Click **"Create Space"**
|
| 47 |
+
|
| 48 |
+
### Step 2: Upload Files to Space
|
| 49 |
+
|
| 50 |
+
#### Option A: Using Git (Recommended)
|
| 51 |
+
```bash
|
| 52 |
+
# Clone your new space
|
| 53 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/rag-analytics-dashboard
|
| 54 |
+
cd rag-analytics-dashboard
|
| 55 |
+
|
| 56 |
+
# Copy your files (from rag12-analytics folder)
|
| 57 |
+
copy app.py .
|
| 58 |
+
copy config.py .
|
| 59 |
+
copy data_loader.py .
|
| 60 |
+
copy requirements.txt .
|
| 61 |
+
copy README.md .
|
| 62 |
+
|
| 63 |
+
# Copy data folder
|
| 64 |
+
xcopy /E /I data data
|
| 65 |
+
|
| 66 |
+
# Commit and push
|
| 67 |
+
git add .
|
| 68 |
+
git commit -m "Initial deployment"
|
| 69 |
+
git push
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
#### Option B: Using Web UI (Easier)
|
| 73 |
+
1. In your Space page, click **"Files"** tab
|
| 74 |
+
2. Click **"Add file"** → **"Upload files"**
|
| 75 |
+
3. Upload these files one by one:
|
| 76 |
+
- `app.py`
|
| 77 |
+
- `config.py`
|
| 78 |
+
- `data_loader.py`
|
| 79 |
+
- `requirements.txt`
|
| 80 |
+
4. Create `data` folder:
|
| 81 |
+
- Click **"Add file"** → **"Create a new file"**
|
| 82 |
+
- Name it `data/.gitkeep` (this creates the folder)
|
| 83 |
+
- Click "Commit"
|
| 84 |
+
5. Upload CSV files to `data/` folder:
|
| 85 |
+
- Click on `data` folder
|
| 86 |
+
- Click **"Add file"** → **"Upload files"**
|
| 87 |
+
- Upload all 4 CSV files
|
| 88 |
+
|
| 89 |
+
### Step 3: Update requirements.txt
|
| 90 |
+
|
| 91 |
+
Make sure your `requirements.txt` contains:
|
| 92 |
+
```
|
| 93 |
+
gradio>=4.0.0
|
| 94 |
+
plotly>=5.18.0
|
| 95 |
+
pandas>=2.0.0
|
| 96 |
+
fastapi
|
| 97 |
+
uvicorn
|
| 98 |
+
python-multipart
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
**Delete these lines if present:**
|
| 102 |
+
- `huggingface-hub<1.0.0` (causes conflicts)
|
| 103 |
+
- Any pydantic version restrictions
|
| 104 |
+
|
| 105 |
+
### Step 4: Verify Deployment
|
| 106 |
+
|
| 107 |
+
1. After upload, Hugging Face will automatically:
|
| 108 |
+
- Install dependencies from `requirements.txt`
|
| 109 |
+
- Run `app.py`
|
| 110 |
+
- Build the Gradio interface
|
| 111 |
+
|
| 112 |
+
2. Wait for build to complete (1-3 minutes)
|
| 113 |
+
- You'll see logs in the **"Logs"** tab
|
| 114 |
+
- Look for: "Running on local URL: http://0.0.0.0:7860"
|
| 115 |
+
|
| 116 |
+
3. Your app will be live at:
|
| 117 |
+
```
|
| 118 |
+
https://huggingface.co/spaces/YOUR_USERNAME/rag-analytics-dashboard
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
---
|
| 122 |
+
|
| 123 |
+
## ⚙️ Configuration for Hugging Face
|
| 124 |
+
|
| 125 |
+
### Update config.py (if needed)
|
| 126 |
+
|
| 127 |
+
The app is already configured to use the `./data` folder by default, which works on HF Spaces:
|
| 128 |
+
|
| 129 |
+
```python
|
| 130 |
+
DATA_FOLDER = os.environ.get("DATA_FOLDER", "./data")
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
No changes needed! ✅
|
| 134 |
+
|
| 135 |
+
### Environment Variables (Optional)
|
| 136 |
+
|
| 137 |
+
If you want to change the data folder location:
|
| 138 |
+
1. Go to Space **Settings** → **Variables and secrets**
|
| 139 |
+
2. Add: `DATA_FOLDER` = `/path/to/data`
|
| 140 |
+
|
| 141 |
+
---
|
| 142 |
+
|
| 143 |
+
## ✅ Verification Checklist
|
| 144 |
+
|
| 145 |
+
After deployment, check:
|
| 146 |
+
|
| 147 |
+
1. **App loads successfully**
|
| 148 |
+
- Visit your Space URL
|
| 149 |
+
- Should see "RAG Pipeline Analytics" header
|
| 150 |
+
- Should show "Version: v2.1.0-fixed"
|
| 151 |
+
|
| 152 |
+
2. **Data loads automatically**
|
| 153 |
+
- Status box should show: "Successfully loaded 23 test runs from 4 file(s)"
|
| 154 |
+
- Should list all 4 CSV files
|
| 155 |
+
|
| 156 |
+
3. **Dropdowns populate**
|
| 157 |
+
- Domain dropdown should show: msmarco, pubmedqa, finqa, cuad
|
| 158 |
+
|
| 159 |
+
4. **Graphs display correctly**
|
| 160 |
+
- Select a domain
|
| 161 |
+
- RMSE graph should show values like 0.325, 0.200, 0.436
|
| 162 |
+
- Performance graph should show values like 0.595, 0.513
|
| 163 |
+
- NOT showing 0.000, 1.000, 2.000
|
| 164 |
+
|
| 165 |
+
5. **Inter-domain comparison works**
|
| 166 |
+
- Click "Generate Comparison" button
|
| 167 |
+
- Table should show configuration differences
|
| 168 |
+
- Bar chart should show different F1 scores per domain
|
| 169 |
+
|
| 170 |
+
---
|
| 171 |
+
|
| 172 |
+
## 🐛 Troubleshooting
|
| 173 |
+
|
| 174 |
+
### Build Fails
|
| 175 |
+
- Check **Logs** tab for errors
|
| 176 |
+
- Common issues:
|
| 177 |
+
- Missing dependencies → Add to `requirements.txt`
|
| 178 |
+
- Incompatible versions → Use version ranges (>=) not exact (==)
|
| 179 |
+
|
| 180 |
+
### App Loads but No Data
|
| 181 |
+
- Verify CSV files are in `data/` folder
|
| 182 |
+
- Check file names match exactly:
|
| 183 |
+
- `Biomedical-pubmedqa.csv`
|
| 184 |
+
- `Finance-finqa.csv`
|
| 185 |
+
- `General-msmarco.csv`
|
| 186 |
+
- `Legal-cuad.csv`
|
| 187 |
+
|
| 188 |
+
### Graphs Show Wrong Values
|
| 189 |
+
- This was the local issue - should NOT happen on HF Spaces
|
| 190 |
+
- HF Spaces runs fresh code every time
|
| 191 |
+
- If it happens, restart the Space: Settings → Factory reboot
|
| 192 |
+
|
| 193 |
+
### Out of Memory
|
| 194 |
+
- Upgrade to better hardware tier (Settings → Change hardware)
|
| 195 |
+
- Or reduce data size
|
| 196 |
+
|
| 197 |
+
---
|
| 198 |
+
|
| 199 |
+
## 🎨 Customization (Optional)
|
| 200 |
+
|
| 201 |
+
### Update README.md
|
| 202 |
+
Create a nice README for your Space visitors:
|
| 203 |
+
|
| 204 |
+
```markdown
|
| 205 |
+
---
|
| 206 |
+
title: RAG Analytics Dashboard
|
| 207 |
+
emoji: 🧬
|
| 208 |
+
colorFrom: blue
|
| 209 |
+
colorTo: green
|
| 210 |
+
sdk: gradio
|
| 211 |
+
sdk_version: 4.0.0
|
| 212 |
+
app_file: app.py
|
| 213 |
+
pinned: false
|
| 214 |
+
---
|
| 215 |
+
|
| 216 |
+
# RAG Pipeline Analytics Dashboard
|
| 217 |
+
|
| 218 |
+
Analyzes RAG (Retrieval-Augmented Generation) system performance across multiple domains.
|
| 219 |
+
|
| 220 |
+
## Features
|
| 221 |
+
- Intra-domain analysis with filtering
|
| 222 |
+
- Inter-domain comparison
|
| 223 |
+
- Interactive visualizations
|
| 224 |
+
- Supports multiple domains: Biomedical, Finance, Legal, General
|
| 225 |
+
|
| 226 |
+
## Usage
|
| 227 |
+
1. Select a domain from the dropdown
|
| 228 |
+
2. Apply filters to compare specific configurations
|
| 229 |
+
3. View RMSE and performance metrics
|
| 230 |
+
4. Compare peak performance across domains
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
### Add Space Thumbnail
|
| 234 |
+
- Add `thumbnail.png` or `thumbnail.jpg` to root
|
| 235 |
+
- Recommended size: 1200x630 pixels
|
| 236 |
+
|
| 237 |
+
---
|
| 238 |
+
|
| 239 |
+
## 📱 Sharing Your Space
|
| 240 |
+
|
| 241 |
+
Once deployed, share with:
|
| 242 |
+
```
|
| 243 |
+
https://huggingface.co/spaces/YOUR_USERNAME/rag-analytics-dashboard
|
| 244 |
+
```
|
| 245 |
+
|
| 246 |
+
Or embed in websites:
|
| 247 |
+
```html
|
| 248 |
+
<iframe
|
| 249 |
+
src="https://YOUR_USERNAME-rag-analytics-dashboard.hf.space"
|
| 250 |
+
frameborder="0"
|
| 251 |
+
width="100%"
|
| 252 |
+
height="800"
|
| 253 |
+
></iframe>
|
| 254 |
+
```
|
| 255 |
+
|
| 256 |
+
---
|
| 257 |
+
|
| 258 |
+
## 🔄 Updating Your Space
|
| 259 |
+
|
| 260 |
+
To update after deployment:
|
| 261 |
+
|
| 262 |
+
### Via Git:
|
| 263 |
+
```bash
|
| 264 |
+
cd rag-analytics-dashboard
|
| 265 |
+
# Make changes to files
|
| 266 |
+
git add .
|
| 267 |
+
git commit -m "Update: description of changes"
|
| 268 |
+
git push
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
### Via Web:
|
| 272 |
+
1. Click on file to edit
|
| 273 |
+
2. Click pencil icon (Edit)
|
| 274 |
+
3. Make changes
|
| 275 |
+
4. Commit changes
|
| 276 |
+
|
| 277 |
+
Space will automatically rebuild and deploy! 🚀
|
| 278 |
+
|
| 279 |
+
---
|
| 280 |
+
|
| 281 |
+
## 💰 Cost
|
| 282 |
+
|
| 283 |
+
- **Free tier:** CPU basic (sufficient for this app)
|
| 284 |
+
- **Upgrade options:** If you need faster performance
|
| 285 |
+
- CPU upgrade: $0.03/hour
|
| 286 |
+
- GPU T4: $0.60/hour (overkill for this app)
|
| 287 |
+
|
| 288 |
+
For this analytics dashboard, **FREE tier is perfectly fine**! ✅
|
README.md
CHANGED
|
@@ -1,11 +1,76 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: 🐠
|
| 4 |
colorFrom: blue
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
-
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: RAG Analytics Dashboard
|
|
|
|
| 3 |
colorFrom: blue
|
| 4 |
+
colorTo: green
|
| 5 |
+
sdk: gradio
|
| 6 |
+
sdk_version: 4.0.0
|
| 7 |
+
app_file: app.py
|
| 8 |
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
short_description: Compare RAG system performance across multiple domains
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# RAG Pipeline Analytics Dashboard
|
| 14 |
+
|
| 15 |
+
Interactive dashboard for analyzing RAG (Retrieval-Augmented Generation) system performance across multiple domains.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- **Intra-Domain Analysis:** Compare different RAG configurations within a single domain
|
| 20 |
+
- **Performance Metrics:** RMSE (Relevance, Utilization, Completeness), F1 Score, AUC-ROC
|
| 21 |
+
- **Interactive Filtering:** Filter tests by reranker model, summarization model, and chunking strategy
|
| 22 |
+
- **Inter-Domain Comparison:** Compare peak performance across different domains
|
| 23 |
+
- **Data Preview:** Inspect raw data and configuration parameters
|
| 24 |
+
|
| 25 |
+
## Supported Domains
|
| 26 |
+
|
| 27 |
+
- **Biomedical** (PubMedQA)
|
| 28 |
+
- **Finance** (FinQA)
|
| 29 |
+
- **General** (MS MARCO)
|
| 30 |
+
- **Legal** (CUAD)
|
| 31 |
+
|
| 32 |
+
## Usage
|
| 33 |
+
|
| 34 |
+
1. **Load Data:** Click "Load/Refresh Data" to load all test results
|
| 35 |
+
2. **Select Domain:** Choose a domain from the dropdown
|
| 36 |
+
3. **Apply Filters:** Use the filter dropdowns to compare specific configurations
|
| 37 |
+
4. **View Metrics:**
|
| 38 |
+
- RMSE graph shows relevance, utilization, and completeness (lower is better)
|
| 39 |
+
- Performance graph shows F1 Score and AUC-ROC (higher is better)
|
| 40 |
+
5. **Compare Domains:** Switch to "Inter-Domain Comparison" tab to see overall best configurations
|
| 41 |
+
|
| 42 |
+
## Interpreting Results
|
| 43 |
+
|
| 44 |
+
### RMSE Metrics (Lower is Better)
|
| 45 |
+
- **Relevance:** How well retrieved documents match the query
|
| 46 |
+
- **Utilization:** How efficiently the context is used
|
| 47 |
+
- **Completeness:** Coverage of required information
|
| 48 |
+
|
| 49 |
+
### Performance Metrics (Higher is Better)
|
| 50 |
+
- **F1 Score:** Balance of precision and recall
|
| 51 |
+
- **AUC-ROC:** Overall classification performance
|
| 52 |
+
|
| 53 |
+
## Configuration Parameters
|
| 54 |
+
|
| 55 |
+
The dashboard analyzes variations in:
|
| 56 |
+
- Embedding models
|
| 57 |
+
- Reranker models
|
| 58 |
+
- Summarization strategies
|
| 59 |
+
- Chunking strategies
|
| 60 |
+
- Retrieval strategies (Dense, Sparse, Hybrid)
|
| 61 |
+
- Hyperparameters (chunk size, overlap, alpha, top-k)
|
| 62 |
+
|
| 63 |
+
## Technology Stack
|
| 64 |
+
|
| 65 |
+
- **Framework:** Gradio 4.0+
|
| 66 |
+
- **Visualization:** Plotly Express
|
| 67 |
+
- **Data Processing:** Pandas
|
| 68 |
+
- **Backend:** FastAPI
|
| 69 |
+
|
| 70 |
+
## License
|
| 71 |
+
|
| 72 |
+
Apache 2.0
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
**Version:** v2.1.0-fixed | Built for AIML @ IIIT Hyderabad - TalentSprint
|
app.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import plotly.express as px
|
| 4 |
+
from fastapi import FastAPI
|
| 5 |
+
from typing import Dict
|
| 6 |
+
|
| 7 |
+
from config import METADATA_COLUMNS, DATA_FOLDER
|
| 8 |
+
from data_loader import load_csv_from_folder, get_available_datasets
|
| 9 |
+
|
| 10 |
+
app = FastAPI()
|
| 11 |
+
DB: Dict[str, pd.DataFrame] = {}
|
| 12 |
+
|
| 13 |
+
# --- 1. DATA PROCESSING FUNCTIONS ---
|
| 14 |
+
|
| 15 |
+
def analyze_domain_configs(df_subset, metadata_columns=None):
    """Separate configuration columns into constants and variables for a domain.

    Args:
        df_subset: DataFrame holding the test rows of a single domain/dataset.
        metadata_columns: Optional collection of column names to exclude from
            the analysis. Defaults to the module-level ``METADATA_COLUMNS``
            from config, preserving the original call behavior.

    Returns:
        tuple: ``(constants, variables)`` where ``constants`` maps each column
        holding a single value across the subset to that value (stringified;
        ``"N/A"`` for an empty subset) and ``variables`` lists columns with
        more than one distinct value.
    """
    # Resolve the default lazily so callers (and tests) can inject their own
    # exclusion set without depending on the config module.
    excluded = METADATA_COLUMNS if metadata_columns is None else metadata_columns
    actual_cols = [c for c in df_subset.columns if c not in excluded]

    constants = {}
    variables = []

    for col in actual_cols:
        # Compare as strings so mixed dtypes in a column (e.g. 1 vs 1.0)
        # don't spuriously split one logical value into several.
        unique_vals = df_subset[col].astype(str).unique()
        if len(unique_vals) <= 1:
            # An empty subset yields no values at all; report "N/A".
            constants[col] = unique_vals[0] if len(unique_vals) > 0 else "N/A"
        else:
            variables.append(col)

    return constants, variables
|
| 30 |
+
|
| 31 |
+
def load_data() -> str:
    """Load CSV test results from the configured folder into the in-memory DB.

    Returns:
        str: a human-readable status message. Failures are reported as text
        rather than raised so the UI status box can display them directly.
    """
    try:
        frame, message = load_csv_from_folder(DATA_FOLDER)
        # Keep any previously loaded data when the new load came back empty.
        if not frame.empty:
            DB["data"] = frame
        return message
    except Exception as exc:
        return f"Error loading data: {str(exc)}"
|
| 40 |
+
|
| 41 |
+
# --- 2. UI LOGIC ---
|
| 42 |
+
|
| 43 |
+
def get_dataset_choices():
    """Return dataset names for the domain dropdown.

    Falls back to an empty list when nothing is loaded or the lookup fails,
    so the dropdown component never breaks the UI.
    """
    try:
        loaded = DB.get("data")
        if loaded is None or loaded.empty:
            return []
        return get_available_datasets(loaded)
    except Exception as exc:
        print(f"Error getting dataset choices: {exc}")
        return []
|
| 52 |
+
|
| 53 |
+
def get_data_preview():
    """Return the first 10 loaded rows for the Data Preview tab.

    An empty DataFrame is returned when nothing has been loaded yet, so the
    Gradio table component always receives a valid frame.
    """
    loaded = DB.get("data")
    if loaded is None:
        return pd.DataFrame()
    return loaded.head(10)
|
| 58 |
+
|
| 59 |
+
def get_domain_state(dataset):
    """Build the constants summary and up to three filter-dropdown updates
    for the selected domain.

    Returns:
        tuple: ``(constants_text, filter_1_update, filter_2_update,
        filter_3_update)``. Filter slots beyond the number of variable
        configuration columns are hidden.
    """
    hidden = gr.update(visible=False, value=None, choices=[])

    if "data" not in DB:
        return "", hidden, hidden, hidden

    frame = DB["data"]
    domain_rows = frame[frame['dataset_name'] == dataset]

    if domain_rows.empty:
        return "No data for this domain.", hidden, hidden, hidden

    consts, variable_cols = analyze_domain_configs(domain_rows)
    const_lines = [f"{key}: {val}" for key, val in consts.items()]
    summary = "CONSTANTS (Fixed for this domain):\n" + "\n".join(const_lines)

    dropdown_updates = []
    for slot in range(3):
        if slot >= len(variable_cols):
            dropdown_updates.append(hidden)
            continue
        column = variable_cols[slot]
        # "All" sentinel goes first so the default selection applies no filter.
        options = ["All"] + list(domain_rows[column].astype(str).unique())
        dropdown_updates.append(gr.update(
            label=f"Filter by {column}",
            choices=options,
            value="All",
            visible=True,
            interactive=True
        ))

    return summary, dropdown_updates[0], dropdown_updates[1], dropdown_updates[2]
|
| 91 |
+
|
| 92 |
+
def plot_metrics_on_x_axis(dataset, f1_val, f2_val, f3_val):
    """Generates RMSE and Performance metric plots for selected domain and filters.

    Args:
        dataset: Domain name selected in the dropdown.
        f1_val, f2_val, f3_val: Current values of the three dynamic filter
            dropdowns; "All" or None means the corresponding filter is inactive.

    Returns:
        tuple: (fig_rmse, fig_perf) Plotly figures, either of which may be
        None when the relevant columns are absent or no rows survive filtering.
    """
    if "data" not in DB or not dataset:
        return None, None

    try:
        df = DB["data"]
        subset = df[df['dataset_name'] == dataset].copy()
    except Exception as e:
        print(f"Error accessing data: {e}")
        return None, None

    # Filter Logic
    # Filter positions map onto the domain's variable columns in the same
    # order get_domain_state populated the dropdowns.
    _, vars_list = analyze_domain_configs(subset)
    filters = [f1_val, f2_val, f3_val]
    for i, val in enumerate(filters):
        if i < len(vars_list) and val != "All" and val is not None:
            col = vars_list[i]
            # String-compare both sides so dropdown strings match numeric columns.
            subset = subset[subset[col].astype(str) == str(val)].copy()  # Explicit copy

    if subset.empty:
        return None, None

    # Reset index to avoid any index-related issues
    subset = subset.reset_index(drop=True)

    # Create Legend Label
    # Ensure test_id is string to prevent errors
    subset['Legend'] = "Test " + subset['test_id'].astype(str) + ": " + subset['config_purpose'].astype(str)

    # --- PLOT 1: RMSE ---
    # Check if columns exist before melting
    rmse_cols = ['rmse_relevance', 'rmse_utilization', 'rmse_completeness']
    available_rmse = [c for c in rmse_cols if c in subset.columns]

    if available_rmse:
        # Long format: one row per (test, metric) pair for grouped bars.
        rmse_melted = subset.melt(
            id_vars=['Legend', 'test_id'],
            value_vars=available_rmse,
            var_name='Metric Name',
            value_name='Score'
        )
        # Explicitly ensure Score is numeric float
        # (unparseable values become 0.0 rather than breaking the plot)
        rmse_melted['Score'] = pd.to_numeric(rmse_melted['Score'], errors='coerce').fillna(0.0).astype(float)
        rmse_melted['Metric Name'] = rmse_melted['Metric Name'].str.replace('rmse_', '').str.capitalize()
        rmse_melted = rmse_melted.reset_index(drop=True)

        # DEBUG: Print to verify values
        print(f"[DEBUG] RMSE melted data - Score range: {rmse_melted['Score'].min():.4f} to {rmse_melted['Score'].max():.4f}")
        print(f"[DEBUG] Sample scores: {rmse_melted['Score'].head(6).tolist()}")

        fig_rmse = px.bar(
            rmse_melted,
            x="Metric Name",
            y="Score",
            color="Legend",
            barmode="group",
            title=f"RMSE Breakdown (Lower is Better) - {len(subset)} Tests",
            text_auto='.3f'
        )
        fig_rmse.update_traces(textposition='outside')
        # Horizontal legend above the plot area to save vertical space.
        fig_rmse.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))
    else:
        fig_rmse = None

    # --- PLOT 2: Performance ---
    perf_cols = ['f1_score', 'aucroc']
    available_perf = [c for c in perf_cols if c in subset.columns]

    if available_perf:
        perf_melted = subset.melt(
            id_vars=['Legend', 'test_id'],
            value_vars=available_perf,
            var_name='Metric Name',
            value_name='Score'
        )
        # Explicitly ensure Score is numeric float
        perf_melted['Score'] = pd.to_numeric(perf_melted['Score'], errors='coerce').fillna(0.0).astype(float)
        perf_melted['Metric Name'] = perf_melted['Metric Name'].replace({
            'f1_score': 'F1 Score', 'aucroc': 'AUC-ROC'
        })
        perf_melted = perf_melted.reset_index(drop=True)

        # DEBUG: Print to verify values
        print(f"[DEBUG] Performance melted data - Score range: {perf_melted['Score'].min():.4f} to {perf_melted['Score'].max():.4f}")
        print(f"[DEBUG] Sample scores: {perf_melted['Score'].head(6).tolist()}")

        fig_perf = px.bar(
            perf_melted,
            x="Metric Name",
            y="Score",
            color="Legend",
            barmode="group",
            title=f"Performance Metrics (Higher is Better) - {len(subset)} Tests",
            text_auto='.3f'
        )
        fig_perf.update_traces(textposition='outside')
        fig_perf.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))
    else:
        fig_perf = None

    return fig_rmse, fig_perf
|
| 194 |
+
|
| 195 |
+
def generate_inter_domain_comparison():
    """Build the cross-domain comparison table and the peak-F1 bar chart.

    Returns:
        tuple: ``(comp_df, fig_global)`` — a DataFrame with one row per
        configuration parameter showing each domain's constant value (or
        "Variable" when it varies within that domain), and a Plotly bar chart
        of the best F1 score per domain (None when no domain rows carry an
        'f1_score' column).
    """
    if "data" not in DB:
        return pd.DataFrame(), None

    try:
        frame = DB["data"]
    except Exception as exc:
        print(f"Error accessing data: {exc}")
        return pd.DataFrame(), None
    domains = frame['dataset_name'].unique()

    # Collect each domain's constant parameters plus the union of all keys.
    domain_constants = {}
    all_keys = set()
    for domain in domains:
        rows = frame[frame['dataset_name'] == domain]
        constants, _ = analyze_domain_configs(rows)
        domain_constants[domain] = constants
        all_keys.update(constants.keys())

    # One table row per parameter; "Variable" marks parameters that are not
    # constant within that domain.
    table_rows = []
    for key in sorted(list(all_keys)):
        entry = {"Configuration Parameter": key}
        for domain in domains:
            entry[domain] = domain_constants[domain].get(key, "Variable")
        table_rows.append(entry)

    comp_df = pd.DataFrame(table_rows)

    # Locate the best-performing configuration (by F1 score) in each domain.
    best_results = []
    for domain in domains:
        rows = frame[frame['dataset_name'] == domain]
        if 'f1_score' in rows.columns:
            top_row = rows.loc[rows['f1_score'].idxmax()]
            best_results.append({
                "Domain": domain,
                "Max F1 Score": rows['f1_score'].max(),
                "Best Config": top_row['config_purpose']
            })

    if best_results:
        best_df = pd.DataFrame(best_results)
        fig_global = px.bar(
            best_df, x="Domain", y="Max F1 Score",
            color="Domain",
            text_auto='.4f',
            hover_data=["Best Config"],
            title="Peak Performance per Domain (Max F1 Score)"
        )
        fig_global.update_traces(textposition='outside')
    else:
        fig_global = None

    return comp_df, fig_global
|
| 253 |
+
|
| 254 |
+
# --- 3. UI ---
APP_VERSION = "v2.1.0-fixed"  # Version stamp to verify code is updated

with gr.Blocks(title="RAG Analytics Pro", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## RAG Pipeline Analytics")
    gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")

    with gr.Row():
        refresh_data_btn = gr.Button("Load/Refresh Data", variant="primary")
        status = gr.Textbox(label="Status (Check here for debug info)", interactive=False, scale=3)

    with gr.Tabs():
        # TAB 1: Main Analytics
        with gr.TabItem("Intra-Domain Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    ds_dropdown = gr.Dropdown(label="1. Select Domain", choices=[], interactive=True)
                    constants_box = gr.Textbox(label="Domain Constants", lines=5, interactive=False)

                    gr.Markdown("### Filter Tests")
                    # Filters start hidden; get_domain_state reveals and
                    # populates up to three of them per selected domain.
                    filter_1 = gr.Dropdown(visible=False)
                    filter_2 = gr.Dropdown(visible=False)
                    filter_3 = gr.Dropdown(visible=False)

                with gr.Column(scale=3):
                    plot_r = gr.Plot(label="RMSE Comparison")
                    plot_p = gr.Plot(label="Performance Comparison")

        # TAB 2: Data Inspector
        with gr.TabItem("Data Preview"):
            gr.Markdown("### Verify your data loaded correctly here")
            preview_table = gr.Dataframe(interactive=False)
            preview_btn = gr.Button("Refresh Data Preview")

        # TAB 3: Comparison
        with gr.TabItem("Inter-Domain Comparison"):
            refresh_btn = gr.Button("Generate Comparison")
            gr.Markdown("### Configuration Differences")
            comp_table = gr.Dataframe(interactive=False)
            gr.Markdown("### Peak Performance")
            global_plot = gr.Plot()

    # EVENTS
    # Reload data, then repopulate the domain dropdown with fresh choices.
    refresh_data_btn.click(
        load_data, inputs=None, outputs=[status]
    ).then(
        lambda: gr.Dropdown(choices=get_dataset_choices()),
        outputs=[ds_dropdown]
    )

    # Changing domain rebuilds constants/filters first, then redraws plots.
    ds_dropdown.change(
        get_domain_state,
        inputs=[ds_dropdown],
        outputs=[constants_box, filter_1, filter_2, filter_3]
    ).then(
        plot_metrics_on_x_axis,
        inputs=[ds_dropdown, filter_1, filter_2, filter_3],
        outputs=[plot_r, plot_p]
    )

    # Any filter change redraws both plots for the current domain.
    gr.on(
        triggers=[filter_1.change, filter_2.change, filter_3.change],
        fn=plot_metrics_on_x_axis,
        inputs=[ds_dropdown, filter_1, filter_2, filter_3],
        outputs=[plot_r, plot_p]
    )

    # Debug Preview Events
    preview_btn.click(get_data_preview, inputs=None, outputs=preview_table)

    refresh_btn.click(
        generate_inter_domain_comparison,
        inputs=None,
        outputs=[comp_table, global_plot]
    )

# Auto-load data on startup
# (runs at import time so the app has data before any button is pressed)
print(f"Loading data from {DATA_FOLDER}...")
startup_status = load_data()
print(startup_status)

# Serve the Gradio UI at "/" on the FastAPI app created at the top of the file.
app = gr.mount_gradio_app(app, demo, path="/")
|
config.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration file for RAG Analytics Application
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
# Data folder configuration
|
| 7 |
+
DATA_FOLDER = os.environ.get("DATA_FOLDER", "./data")
|
| 8 |
+
|
| 9 |
+
# Required columns after normalization
|
| 10 |
+
REQUIRED_COLUMNS = {
|
| 11 |
+
'test_id',
|
| 12 |
+
'config_purpose',
|
| 13 |
+
'dataset_name'
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
# Metric columns that need numeric conversion to float
|
| 17 |
+
METRIC_COLUMNS = [
|
| 18 |
+
'rmse_relevance',
|
| 19 |
+
'rmse_utilization',
|
| 20 |
+
'rmse_completeness',
|
| 21 |
+
'f1_score',
|
| 22 |
+
'aucroc',
|
| 23 |
+
'failed_samples'
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
# Numeric configuration columns (also need float conversion)
|
| 27 |
+
NUMERIC_CONFIG_COLUMNS = [
|
| 28 |
+
'chunk_size',
|
| 29 |
+
'overlap',
|
| 30 |
+
'stride',
|
| 31 |
+
'alpha',
|
| 32 |
+
'retr_k',
|
| 33 |
+
'final_k',
|
| 34 |
+
'summ_max',
|
| 35 |
+
'summ_min',
|
| 36 |
+
'test_id'
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
# Column mapping for normalization
|
| 40 |
+
COLUMN_MAP = {
|
| 41 |
+
'test': 'test_id',
|
| 42 |
+
'configurationpurpose': 'config_purpose',
|
| 43 |
+
'subsets': 'dataset_name',
|
| 44 |
+
'embeddingmodel': 'embedding_model',
|
| 45 |
+
'rerankermodel': 'reranker_model',
|
| 46 |
+
'summarizationmodel': 'summarization_model',
|
| 47 |
+
'chunkingstrategy': 'chunking_strategy',
|
| 48 |
+
'chunksize': 'chunk_size',
|
| 49 |
+
'overlap': 'overlap',
|
| 50 |
+
'stride': 'stride',
|
| 51 |
+
'retreivalstrategy': 'retrieval_strategy',
|
| 52 |
+
'retrievalstrategy': 'retrieval_strategy', # Catch typo
|
| 53 |
+
'alpha': 'alpha',
|
| 54 |
+
'retrk': 'retr_k',
|
| 55 |
+
'finalk': 'final_k',
|
| 56 |
+
'repacking': 'repacking',
|
| 57 |
+
'summmax': 'summ_max',
|
| 58 |
+
'summmin': 'summ_min',
|
| 59 |
+
'8bgptlabel': 'gpt_label',
|
| 60 |
+
|
| 61 |
+
# Metrics
|
| 62 |
+
'rmsetracerelevance': 'rmse_relevance',
|
| 63 |
+
'rmsetraceutilization': 'rmse_utilization',
|
| 64 |
+
'rmsetracecompleteness': 'rmse_completeness',
|
| 65 |
+
'aucroc': 'aucroc',
|
| 66 |
+
'f1score': 'f1_score',
|
| 67 |
+
'failedtotalsamples': 'failed_samples'
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
# Metadata columns (excluded from constant/variable analysis)
|
| 71 |
+
METADATA_COLUMNS = [
|
| 72 |
+
'rmse_relevance', 'rmse_utilization', 'rmse_completeness',
|
| 73 |
+
'aucroc', 'f1_score', 'failed_samples',
|
| 74 |
+
'test_id', 'config_purpose', 'dataset_name'
|
| 75 |
+
]
|
| 76 |
+
|
| 77 |
+
# Debug mode
|
| 78 |
+
DEBUG = True
|
data/Biomedical-pubmedqa.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
|
| 2 |
+
1,Efficiency Baseline,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,None,hard_cut,256,50,N/A,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3677,0.3011,0.5556,0.604,0.5049,32/100
|
| 3 |
+
2,Chunking Proof,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,None,sliding_window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3632,0.2886,0.5074,0.604,0.5049,29/100
|
| 4 |
+
3,Reranking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,None,sliding_window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,long,0.3289,0.2663,0.6015,0.482,0.38,8/100
|
| 5 |
+
4,Repacking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,None,hard_cut,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.2752,0.252,0.6246,0.5951,0.449,8/100
|
| 6 |
+
5,Prove Summarization,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,fangyuan/nq_abstractive_compressor,sliding_window,256,50,206,Hybrid,0.8,50,5,reverse,150,20,long_cot,0.4934,1.0537,0.5161,cannot compute,0,9/100
|
| 7 |
+
6,Optimal Medical Hybrid,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,N/A,sliding_window,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.3223,0.2733,0.6561,0.5053,0.3542,13/100
|
data/Finance-finqa.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
|
| 2 |
+
1,Efficiency Baseline,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,hard-cut,512,50,N/A,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.1409,0.0831,0.6365,0.4263,0.099,18/100
|
| 3 |
+
2,Prove Chunking,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,sliding-window,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.1667,0.1188,0.6431,0.4316,0.1176,23/100
|
| 4 |
+
3,Prove Hybrid/Rerank,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,sliding-window,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,long,0.1316,0.0693,0.6763,0.4263,0.099,11/100
|
| 5 |
+
4,Max Raw Context,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,hard-cut,512,100,412,Hybrid,0.6,50,5,reverse,N/A,N/A,long,0.1947,0.0795,0.7239,0.4316,0.1176,14/100
|
| 6 |
+
5,Golden Setup,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,fangyuan/nq_abstractive_compressor,sliding-window,512,100,412,Hybrid,0.6,50,5,reverse,200,20,long_cot,0.4158,0.8363,0.7073,Cannot compute (insufficient class variance),0,4/100
|
| 7 |
+
6,Optimized Financial,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,sliding-window,512,100,412,Hybrid,0.8,50,3,forward,N/A,N/A,long,0.2468,0.1679,0.6177,0.5474,0.1731,6/100
|
data/General-msmarco.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
|
| 2 |
+
1,Efficiency Baseline,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,hard-cut,256,50,N/A,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3252,0.1998,0.4362,0.5125,0.5954,23/100
|
| 3 |
+
2,Prove Chunking,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,sliding-window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3449,0.1947,0.4248,0.495,0.5625,30/100
|
| 4 |
+
3,Prove Hybrid/Rerank,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,N/A,sliding-window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3183,0.1793,0.407,0.5183,0.6061,22/100
|
| 5 |
+
4,Prove Repacking,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,N/A,hard-cut,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.3416,0.1837,0.4491,0.559,0.6763,11/100
|
| 6 |
+
5,Prove Summarization,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,fangyuan/nq_abstractive_compressor,sliding-window,256,50,206,Hybrid,0.8,50,5,reverse,150,20,long_cot,0.5066,0.8781,0.5049,N/A,0,3/100
|
| 7 |
+
6,Optimized Hybrid,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,N/A,hard-cut,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,long,0.3292,0.1754,0.5477,0.4842,0.4706,0/100
|
data/Legal-cuad.csv
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,% Failed Sample
|
| 2 |
+
1,Efficiency Baseline,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,hard-cut/ token aware chunking ,512,100,N/A,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.2951,0.1697,0.6225,0.4321,0.3761,35.00%
|
| 3 |
+
2,Prove Chunking,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,sliding-window,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.2927,0.1623,0.5612,0.4065,0.2609,32.00%
|
| 4 |
+
3,Prove Hybrid/Rerank,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,sliding-window,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,long,0.3087,0.1296,0.5315,0.5197,0.5543,15.00%
|
| 5 |
+
4,Max Raw Context,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,hard-cut/ token aware chunking ,512,100,412,Hybrid,0.6,50,5,reverse,N/A,N/A,long,0.3287,0.1429,0.6583,0.4132,0.3859,17.00%
|
| 6 |
+
5,Golden Setup,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,fangyuan/nq_abstractive_compressor,sliding-window,512,100,412,Hybrid,0.6,50,5,reverse,250,50,long_cot,0.5048,0.7648,0.4832,0.0215,0.5054,17.00%
|
data_loader.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data loading and processing module for RAG Analytics
|
| 3 |
+
"""
|
| 4 |
+
import os
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd

from config import (
    COLUMN_MAP,
    DATA_FOLDER,
    DEBUG,
    METRIC_COLUMNS,
    NUMERIC_CONFIG_COLUMNS,
    REQUIRED_COLUMNS,
)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize a raw CSV dataframe for downstream analysis.

    1. Renames columns by lower-casing and stripping every non-alphanumeric
       character (spaces, '=', '-'), then looking the result up in COLUMN_MAP.
    2. Coerces metric columns to float; unparseable cells become 0.0.
    3. Coerces numeric configuration columns to float; N/A stays NaN.

    No rows are dropped here — schema validation happens separately.

    Args:
        df: Raw dataframe loaded from CSV.

    Returns:
        Normalized dataframe with standardized column names and types.
    """
    def _squash(label: object) -> str:
        # "RMSE=trace relevance" -> "rmsetracerelevance"
        return "".join(ch for ch in str(label).lower() if ch.isalnum())

    renames = {
        col: COLUMN_MAP[_squash(col)]
        for col in df.columns
        if _squash(col) in COLUMN_MAP
    }
    df = df.rename(columns=renames)

    # Metrics must always be plottable floats: text cells such as
    # "cannot compute" coerce to NaN and are replaced with 0.0 so the
    # graphs never crash on invalid values.
    for metric in METRIC_COLUMNS:
        if metric in df.columns:
            df[metric] = pd.to_numeric(df[metric], errors='coerce').fillna(0.0).astype(float)

    # Numeric configuration knobs become floats too (so "256" renders as
    # 256.0 consistently), but N/A is kept as NaN so a missing setting
    # stays distinguishable from a real zero.
    for knob in NUMERIC_CONFIG_COLUMNS:
        if knob in df.columns:
            df[knob] = pd.to_numeric(df[knob], errors='coerce').astype(float)

    return df
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, str]:
    """
    Check that a normalized dataframe is usable.

    Args:
        df: Dataframe to validate.

    Returns:
        Tuple of (is_valid, message); the message names the missing
        columns or states the dataframe is empty, and is "Valid" on success.
    """
    absent = REQUIRED_COLUMNS.difference(df.columns)
    if absent:
        return False, f"Missing required columns: {', '.join(absent)}"

    if df.empty:
        return False, "Dataframe is empty"

    return True, "Valid"
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def load_csv_from_folder(folder_path: Optional[str] = None) -> Tuple[pd.DataFrame, str]:
    """
    Loads all CSV files from the specified folder and combines them.

    Files that fail to parse or validate are skipped and reported in the
    status message instead of aborting the whole load.

    Args:
        folder_path: Path to folder containing CSV files. If None, uses
            DATA_FOLDER from config.

    Returns:
        Tuple of (combined_dataframe, status_message). The dataframe is
        empty when nothing could be loaded, with the reason in the message.
    """
    if folder_path is None:
        folder_path = DATA_FOLDER

    folder = Path(folder_path)

    if not folder.exists():
        return pd.DataFrame(), f"Error: Data folder '{folder_path}' does not exist."

    if not folder.is_dir():
        return pd.DataFrame(), f"Error: '{folder_path}' is not a directory."

    # Sort for a deterministic load (and combined-row) order across
    # platforms: Path.glob() ordering is filesystem-dependent.
    csv_files = sorted(folder.glob("*.csv"))

    if not csv_files:
        return pd.DataFrame(), f"Error: No CSV files found in '{folder_path}'."

    all_dfs = []
    loaded_files = []
    errors = []

    for csv_file in csv_files:
        try:
            # utf-8-sig transparently strips a BOM from Excel exports.
            df_raw = pd.read_csv(csv_file, encoding='utf-8-sig')

            # Normalize column names and types
            df_clean = normalize_dataframe(df_raw)

            # Validate; skip (and report) files missing required columns.
            is_valid, error_msg = validate_dataframe(df_clean)
            if not is_valid:
                errors.append(f"{csv_file.name}: {error_msg}")
                continue

            all_dfs.append(df_clean)
            loaded_files.append(csv_file.name)

        except Exception as e:
            # Best-effort loading: keep going and surface the failure
            # in the status text rather than crashing the app.
            errors.append(f"{csv_file.name}: {str(e)}")

    if not all_dfs:
        error_summary = "\n".join(errors) if errors else "Unknown error"
        return pd.DataFrame(), f"Error: Failed to load any valid CSV files.\n{error_summary}"

    # Combine all dataframes
    final_df = pd.concat(all_dfs, ignore_index=True)

    # Build status message
    status_parts = [f"Successfully loaded {len(final_df)} test runs from {len(loaded_files)} file(s):"]
    status_parts.extend([f" • {fname}" for fname in loaded_files])

    if errors:
        status_parts.append(f"\n{len(errors)} file(s) skipped due to errors:")
        status_parts.extend([f" • {err}" for err in errors])

    # Add debug info if enabled: quick sanity check of row 1's key metrics.
    if DEBUG and not final_df.empty:
        sample = final_df.iloc[0]
        debug_info = f"\nDEBUG (Row 1): Relevance={sample.get('rmse_relevance', 'N/A')}, F1={sample.get('f1_score', 'N/A')}, AUCROC={sample.get('aucroc', 'N/A')}"
        status_parts.append(debug_info)

    return final_df, "\n".join(status_parts)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def get_available_datasets(df: pd.DataFrame) -> List[str]:
    """
    Extracts unique dataset names from the dataframe.

    Missing (NaN) entries are dropped before sorting: mixing NaN floats
    with strings would otherwise make sorted() raise TypeError.

    Args:
        df: Dataframe containing dataset_name column

    Returns:
        Sorted list of unique dataset names; empty when the dataframe is
        empty or the column is absent.
    """
    if df.empty or 'dataset_name' not in df.columns:
        return []

    return sorted(df['dataset_name'].dropna().unique().tolist())
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.44.0
|
| 2 |
+
plotly>=5.18.0
|
| 3 |
+
pandas>=2.0.0
|
| 4 |
+
fastapi
|
| 5 |
+
uvicorn
|
| 6 |
+
python-multipart
|