nemo committed on
Commit
c03ae2c
·
1 Parent(s): c978915
Files changed (2) hide show
  1. app.py +480 -0
  2. peft_issues_analyzed_500.json +0 -0
app.py ADDED
@@ -0,0 +1,480 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Gradio dashboard for visualizing analyzed peft issues with time range filtering."""
3
+
4
+ import json
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+
8
+ import gradio as gr
9
+ import pandas as pd
10
+ import plotly.express as px
11
+ import plotly.graph_objects as go
12
+
13
# JSON file of analyzed issues that load_data() reads. The data file added
# together with this script is peft_issues_analyzed_500.json, so the path
# must use that name (it previously pointed at "peft_issues_merged_500.json").
ANALYZED_FILE = Path("peft_issues_analyzed_500.json")
14
+
15
+
16
def parse_date(date_str):
    """Convert an ISO-8601 timestamp string into a ``YYYY-MM`` string.

    Every ``Z`` in the input is rewritten to ``+00:00`` before the string is
    handed to ``datetime.fromisoformat``.

    Args:
        date_str: Timestamp text such as ``"2024-03-15T10:00:00Z"``.

    Returns:
        The year and month as ``"YYYY-MM"``, or the literal string
        ``"unknown"`` when ``date_str`` is not a string or cannot be parsed.
    """
    try:
        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        return dt.strftime("%Y-%m")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a str (e.g. None or bytes).
        # ValueError: malformed timestamp. Anything else (KeyboardInterrupt,
        # SystemExit, ...) is deliberately allowed to propagate.
        return "unknown"
23
+
24
+
25
def parse_date_full(date_str):
    """Convert an ISO-8601 timestamp string into a ``datetime`` object.

    Every ``Z`` in the input is rewritten to ``+00:00`` before the string is
    handed to ``datetime.fromisoformat``.

    Args:
        date_str: Timestamp text such as ``"2024-03-15T10:00:00Z"``.

    Returns:
        The parsed ``datetime``, or ``datetime.min`` when ``date_str`` is not
        a string or cannot be parsed.

    Note:
        ``datetime.min`` is timezone-naive, whereas inputs carrying an offset
        parse to timezone-aware values; ordering comparisons between the two
        raise ``TypeError``.
    """
    try:
        return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # Only swallow "bad input" errors; never KeyboardInterrupt/SystemExit.
        return datetime.min
31
+
32
+
33
def load_data():
    """Read ANALYZED_FILE and attach parsed creation dates to every record.

    Each item gains two keys derived from its "created_at" value (an empty
    string is used when the key is absent): "year_month" from parse_date()
    and "date_obj" from parse_date_full(). The items are updated in place.

    Returns:
        The object decoded from the JSON file, with the extra keys added.
    """
    with open(ANALYZED_FILE, "r", encoding="utf-8") as handle:
        issues = json.load(handle)

    # Enrich each record with the month label and the full datetime.
    for issue in issues:
        issue["year_month"] = parse_date(issue.get("created_at", ""))
        issue["date_obj"] = parse_date_full(issue.get("created_at", ""))

    return issues
44
+
45
+
46
def create_dataframe(data):
    """Build the dashboard table from the analyzed issue records.

    Args:
        data: Iterable of issue dicts. Each must provide "number", "title",
            "state", "model", "trainer", "peft_method", "training_type",
            "experience_score", "specialties" and "html_url". "year_month"
            is optional and falls back to "unknown".

    Returns:
        A DataFrame with one row per issue. The column list is passed
        explicitly so that an empty ``data`` still produces a frame carrying
        every expected column (otherwise later column lookups would raise
        KeyError).

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    columns = [
        "Issue #", "Title", "State", "Date", "Model", "Trainer",
        "PEFT Method", "Training Type", "Experience", "Specialties", "URL",
    ]

    rows = []
    for item in data:
        title = item["title"]
        specialties = item["specialties"]
        rows.append({
            "Issue #": item["number"],
            # Long titles are cut to 100 characters plus an ellipsis.
            "Title": title[:100] + "..." if len(title) > 100 else title,
            "State": item["state"],
            "Date": item.get("year_month", "unknown"),
            "Model": item["model"],
            "Trainer": item["trainer"],
            "PEFT Method": item["peft_method"],
            "Training Type": item["training_type"],
            "Experience": item["experience_score"],
            # The placeholder list ["none"] is displayed as a dash.
            "Specialties": ", ".join(specialties) if specialties != ["none"] else "-",
            "URL": item["html_url"],
        })

    return pd.DataFrame(rows, columns=columns)
64
+
65
+
66
def filter_data(df, model_filter, trainer_filter, peft_filter, training_filter, min_score, max_score, min_month, max_month):
    """Return the rows of ``df`` that satisfy every selected filter.

    Args:
        df: Table with the columns "Model", "Trainer", "PEFT Method",
            "Training Type", "Experience" and "Date".
        model_filter: Exact "Model" value to keep, or "All" for no filtering.
        trainer_filter: Exact "Trainer" value to keep, or "All".
        peft_filter: Exact "PEFT Method" value to keep, or "All".
        training_filter: Exact "Training Type" value to keep, or "All".
        min_score: Inclusive lower bound on "Experience".
        max_score: Inclusive upper bound on "Experience".
        min_month: Inclusive lower "YYYY-MM" bound on "Date"; ``None`` or an
            empty string leaves the range open at this end.
        max_month: Inclusive upper "YYYY-MM" bound on "Date"; ``None`` or an
            empty string leaves the range open at this end.

    Returns:
        The filtered DataFrame; ``df`` itself is not modified.

    Note:
        "Date" is compared as text. The placeholder "unknown" sorts after
        any "YYYY-MM" value, so undated rows are dropped whenever an upper
        month bound is supplied.
    """
    exact_matches = (
        ("Model", model_filter),
        ("Trainer", trainer_filter),
        ("PEFT Method", peft_filter),
        ("Training Type", training_filter),
    )
    for column, wanted in exact_matches:
        if wanted != "All":
            df = df[df[column] == wanted]

    df = df[(df["Experience"] >= min_score) & (df["Experience"] <= max_score)]

    # Date range filtering. An unset bound (e.g. a dropdown without a
    # selection) is skipped instead of being compared against the strings,
    # which would raise TypeError.
    if min_month:
        df = df[df["Date"] >= min_month]
    if max_month:
        df = df[df["Date"] <= max_month]

    return df
83
+
84
+
85
def get_unique_values(data, key):
    """Return dropdown choices for ``key``: "All" followed by the sorted distinct values."""
    distinct = {record[key] for record in data}
    return ["All", *sorted(distinct)]
89
+
90
+
91
def get_month_range(data):
    """Return ``[earliest, latest]`` "YYYY-MM" month found in ``data``.

    Records whose "year_month" is missing, empty or "unknown" are ignored.
    (Previously a record without the key slipped through the filter and
    injected the placeholder "unknown" into the range.)

    Args:
        data: Iterable of issue dicts.

    Returns:
        A two-element list ``[min_month, max_month]``, or the fallback
        ``["2023-01", "2026-12"]`` when no record has a usable month.
    """
    months = set()
    for item in data:
        month = item.get("year_month")
        if month and month != "unknown":
            months.add(month)

    if not months:
        return ["2023-01", "2026-12"]
    # "YYYY-MM" strings order chronologically under plain string comparison.
    return [min(months), max(months)]
97
+
98
+
99
def get_all_months(data):
    """Return every distinct "YYYY-MM" month in ``data``, in ascending order.

    Records whose "year_month" is missing, empty or "unknown" are ignored.
    (Previously a record without the key slipped through the filter and
    added the placeholder "unknown" to the result.)

    Args:
        data: Iterable of issue dicts.

    Returns:
        A sorted list of unique month strings; empty when none are usable.
    """
    months = set()
    for item in data:
        month = item.get("year_month")
        if month and month != "unknown":
            months.add(month)
    # "YYYY-MM" strings order chronologically under plain string sorting.
    return sorted(months)
103
+
104
+
105
def create_peft_method_chart(data):
    """Bar chart counting issues per "peft_method" value.

    Returns an empty figure when ``data`` is empty. Bars appear in the order
    in which each method is first encountered.
    """
    if not data:
        return go.Figure()

    tally = {}
    for issue in data:
        method = issue["peft_method"]
        if method in tally:
            tally[method] += 1
        else:
            tally[method] = 1

    frame = pd.DataFrame({
        "PEFT Method": list(tally),
        "Count": list(tally.values()),
    })
    chart = px.bar(
        frame,
        x="PEFT Method",
        y="Count",
        title="PEFT Method Distribution",
        color="PEFT Method",
        template="plotly_white",
    )
    # The x axis already names each bar, so the colour legend is redundant.
    chart.update_layout(showlegend=False)
    return chart
120
+
121
+
122
def create_trainer_chart(data):
    """Pie chart of the share of issues per "trainer" value.

    Returns an empty figure when ``data`` is empty.
    """
    if not data:
        return go.Figure()

    tally = {}
    for issue in data:
        framework = issue["trainer"]
        if framework in tally:
            tally[framework] += 1
        else:
            tally[framework] = 1

    frame = pd.DataFrame({
        "Trainer": list(tally),
        "Count": list(tally.values()),
    })
    return px.pie(
        frame,
        values="Count",
        names="Trainer",
        title="Trainer Framework Distribution",
        template="plotly_white",
    )
136
+
137
+
138
def create_training_type_chart(data):
    """Bar chart counting issues per "training_type" value.

    Returns an empty figure when ``data`` is empty. Bars appear in the order
    in which each training type is first encountered.
    """
    if not data:
        return go.Figure()

    tally = {}
    for issue in data:
        kind = issue["training_type"]
        if kind in tally:
            tally[kind] += 1
        else:
            tally[kind] = 1

    frame = pd.DataFrame({
        "Training Type": list(tally),
        "Count": list(tally.values()),
    })
    chart = px.bar(
        frame,
        x="Training Type",
        y="Count",
        title="Training Type Distribution",
        color="Training Type",
        template="plotly_white",
    )
    # The x axis already names each bar, so the colour legend is redundant.
    chart.update_layout(showlegend=False)
    return chart
153
+
154
+
155
def create_experience_chart(data):
    """Histogram of the "experience_score" values, requesting 10 bins.

    Returns an empty figure when ``data`` is empty.
    """
    if not data:
        return go.Figure()

    score_values = list(issue["experience_score"] for issue in data)
    axis_labels = {"x": "Experience Score", "y": "Count"}
    chart = px.histogram(
        x=score_values,
        nbins=10,
        title="Experience Score Distribution",
        labels=axis_labels,
        template="plotly_white",
    )
    chart.update_traces(marker_color="steelblue")
    return chart
166
+
167
+
168
def create_experience_by_method_chart(data):
    """Bar chart of the mean "experience_score" for each "peft_method".

    Returns an empty figure when ``data`` is empty. The y axis is pinned to
    the range 0-10.
    """
    if not data:
        return go.Figure()

    # Collect the scores of every method, then average each group.
    scores_by_method = {}
    for issue in data:
        scores_by_method.setdefault(issue["peft_method"], []).append(issue["experience_score"])

    averages = {
        method: sum(scores) / len(scores)
        for method, scores in scores_by_method.items()
    }

    frame = pd.DataFrame({
        "PEFT Method": list(averages),
        "Avg Score": list(averages.values()),
    })
    chart = px.bar(
        frame,
        x="PEFT Method",
        y="Avg Score",
        title="Average Experience Score by PEFT Method",
        color="PEFT Method",
        template="plotly_white",
    )
    chart.update_layout(showlegend=False, yaxis_range=[0, 10])
    return chart
186
+
187
+
188
def create_specialties_chart(data):
    """Bar chart counting how often each specialty tag occurs.

    Every entry of each issue's "specialties" list is counted except the
    placeholder "none". Returns an empty figure when ``data`` is empty or
    when no real specialty is present.
    """
    if not data:
        return go.Figure()

    tags = (
        tag
        for issue in data
        for tag in issue["specialties"]
        if tag != "none"
    )
    tally = {}
    for tag in tags:
        tally[tag] = 1 + tally.get(tag, 0)

    if len(tally) == 0:
        return go.Figure()

    frame = pd.DataFrame({
        "Specialty": list(tally),
        "Count": list(tally.values()),
    })
    chart = px.bar(
        frame,
        x="Specialty",
        y="Count",
        title="Special Technologies Distribution",
        color="Specialty",
        template="plotly_white",
    )
    chart.update_layout(showlegend=False)
    return chart
207
+
208
+
209
def create_model_chart(data):
    """Bar chart counting issues per "model" value, skipping "unknown".

    Returns an empty figure when ``data`` is empty or when every issue's
    model is "unknown".
    """
    if not data:
        return go.Figure()

    tally = {}
    for issue in data:
        name = issue["model"]
        if name == "unknown":
            continue
        tally[name] = 1 + tally.get(name, 0)

    if len(tally) == 0:
        return go.Figure()

    frame = pd.DataFrame({
        "Model": list(tally),
        "Count": list(tally.values()),
    })
    chart = px.bar(
        frame,
        x="Model",
        y="Count",
        title="Model Distribution",
        color="Model",
        template="plotly_white",
    )
    chart.update_layout(showlegend=False)
    return chart
228
+
229
+
230
def create_temporal_chart(data):
    """Line chart of the number of issues per "year_month", oldest first.

    Issues whose "year_month" is absent or "unknown" are left out. Returns
    an empty figure when ``data`` is empty or no issue has a known month.
    """
    if not data:
        return go.Figure()

    per_month = {}
    for issue in data:
        month = issue.get("year_month", "unknown")
        if month == "unknown":
            continue
        per_month[month] = 1 + per_month.get(month, 0)

    if len(per_month) == 0:
        return go.Figure()

    # Month labels are sorted as text to put the points in time order.
    ordered = sorted(per_month)
    frame = pd.DataFrame({
        "Month": ordered,
        "Issues": [per_month[month] for month in ordered],
    })
    chart = px.line(
        frame,
        x="Month",
        y="Issues",
        title="Issues Over Time",
        markers=True,
        template="plotly_white",
    )
    chart.update_layout(xaxis_tickangle=-45)
    return chart
250
+
251
+
252
def show_issue_details(issue_number, data):
    """Render one issue as a Markdown summary.

    Args:
        issue_number: Value compared with ``==`` against each record's
            "number".
        data: Iterable of issue dicts. The matching record must provide
            "number", "title", "html_url", "state", "author", "labels",
            "model", "trainer", "peft_method", "training_type",
            "specialties", "experience_score", "experience_rationale" and
            "confidence" (a dict); "created_at" is optional.

    Returns:
        Markdown text for the first matching record, or the string
        "Issue not found" when nothing matches.

    Raises:
        KeyError: If the matching record lacks a required key.
    """
    item = next((entry for entry in data if entry["number"] == issue_number), None)
    if item is None:
        return "Issue not found"

    # A lone "\n" is only a soft break in Markdown and is rendered as a
    # space, which used to glue all fields into one paragraph. Two trailing
    # spaces before the newline force a hard line break.
    line_break = "  \n"

    confidence = item["confidence"]
    created = item["created_at"][:10] if item.get("created_at") else "unknown"
    # Older records may store the trainer confidence under "trainer".
    trainer_confidence = confidence.get("trainer_framework", confidence.get("trainer", "N/A"))

    heading = f"**Issue #{item['number']}**: [{item['title']}]({item['html_url']})"
    metadata = line_break.join([
        f"**State**: {item['state']}",
        f"**Author**: {item['author']}",
        f"**Created**: {created}",
        f"**Labels**: {', '.join(item['labels']) or 'None'}",
    ])
    classification = line_break.join([
        f"**Model**: {item['model']}",
        f"**Trainer**: {item['trainer']}",
        f"**PEFT Method**: {item['peft_method']}",
        f"**Training Type**: {item['training_type']}",
        f"**Specialties**: {', '.join(item['specialties'])}",
    ])
    assessment = line_break.join([
        f"**Experience Score**: {item['experience_score']}/10",
        f"**Rationale**: {item['experience_rationale']}",
    ])
    confidence_list = (
        "**Confidence**:\n\n"
        f"- Model: {confidence.get('model', 'N/A')}\n"
        f"- Trainer: {trainer_confidence}\n"
        f"- PEFT Method: {confidence.get('peft_method', 'N/A')}\n"
        f"- Training Type: {confidence.get('training_type', 'N/A')}\n"
        f"- Experience: {confidence.get('experience_score', 'N/A')}\n"
    )

    # Blank lines separate the paragraphs.
    return "\n\n".join([heading, metadata, classification, assessment, confidence_list])
277
+
278
+
279
def filter_data_by_months(data, min_month, max_month):
    """Return the records of ``data`` whose month lies in the given range.

    Args:
        data: Iterable of issue dicts; a missing "year_month" is treated as
            "unknown".
        min_month: Inclusive lower "YYYY-MM" bound; ``None`` or an empty
            string leaves the range open at this end.
        max_month: Inclusive upper "YYYY-MM" bound; ``None`` or an empty
            string leaves the range open at this end.

    Returns:
        A new list with the matching records, in their original order.

    Note:
        Months are compared as text. "unknown" sorts after any "YYYY-MM"
        value, so undated records are dropped whenever ``max_month`` is set.
    """
    def in_range(month):
        # Skip an unset bound instead of comparing str with None, which
        # would raise TypeError.
        if min_month and month < min_month:
            return False
        if max_month and month > max_month:
            return False
        return True

    return [item for item in data if in_range(item.get("year_month", "unknown"))]
282
+
283
+
284
def build_app():
    """Build the Gradio application.

    Loads the issue records once via load_data(), then assembles a Blocks
    UI with a global month-range filter and four tabs: a filterable data
    table, a per-issue detail view, analytics charts and an about page.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    data = load_data()
    df = create_dataframe(data)

    # Get month range
    month_range = get_month_range(data)
    all_months = get_all_months(data)

    # With no parsable dates, fall back to the default range so the dropdown
    # defaults are still members of their choices.
    month_options = all_months or month_range

    # Guard against an empty dataset instead of failing on data[0].
    first_issue_number = data[0]["number"] if data else None

    with gr.Blocks(title="PEFT Issues Analysis Dashboard") as app:
        gr.Markdown("# 🔍 PEFT Issues Analysis Dashboard")
        gr.Markdown("Analysis of 345 most recent issues from [huggingface/peft](https://github.com/huggingface/peft) — classified by LLM")

        # Global date range filter at the top
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("### 📅 Global Time Range Filter")
            with gr.Column(scale=8):
                # Use dropdowns for month selection since Gradio slider doesn't support strings well
                min_month = gr.Dropdown(
                    choices=month_options,
                    value=month_range[0],
                    label="From Month",
                    allow_custom_value=False
                )
                max_month = gr.Dropdown(
                    choices=month_options,
                    value=month_range[-1],
                    label="To Month",
                    allow_custom_value=False
                )

        with gr.Tabs():
            with gr.Tab("📊 Data Table"):
                with gr.Row():
                    model_filter = gr.Dropdown(
                        choices=get_unique_values(data, "model"),
                        value="All",
                        label="Model"
                    )
                    trainer_filter = gr.Dropdown(
                        choices=get_unique_values(data, "trainer"),
                        value="All",
                        label="Trainer"
                    )
                    peft_filter = gr.Dropdown(
                        choices=get_unique_values(data, "peft_method"),
                        value="All",
                        label="PEFT Method"
                    )
                    training_filter = gr.Dropdown(
                        choices=get_unique_values(data, "training_type"),
                        value="All",
                        label="Training Type"
                    )

                with gr.Row():
                    min_score = gr.Slider(0, 10, value=0, step=1, label="Min Experience Score")
                    max_score = gr.Slider(0, 10, value=10, step=1, label="Max Experience Score")

                table = gr.DataFrame(
                    value=df,
                    headers=["Issue #", "Title", "State", "Date", "Model", "Trainer", "PEFT Method",
                             "Training Type", "Experience", "Specialties", "URL"],
                    interactive=False,
                    wrap=True
                )

                def update_table(m, t, p, tr, min_s, max_s, min_m, max_m):
                    # filter_data only selects rows and returns a new frame,
                    # so the shared df needs no defensive copy.
                    return filter_data(df, m, t, p, tr, min_s, max_s, min_m, max_m)

                all_filters = [model_filter, trainer_filter, peft_filter, training_filter,
                               min_score, max_score, min_month, max_month]

                # Any filter change re-renders the table from all filters.
                for component in all_filters:
                    component.change(
                        fn=update_table,
                        inputs=all_filters,
                        outputs=table
                    )

            with gr.Tab("🔎 Issue Details"):
                def update_details(num):
                    # The number field yields None once it is cleared;
                    # int(None) would raise TypeError.
                    if num is None:
                        return "Enter an issue number to see its details."
                    return show_issue_details(int(num), data)

                issue_number = gr.Number(label="Issue Number", value=first_issue_number, precision=0)
                # Supply the initial text through the constructor rather
                # than assigning to .value after the component exists.
                details = gr.Markdown(value=update_details(first_issue_number))

                issue_number.change(fn=update_details, inputs=issue_number, outputs=details)

            with gr.Tab("📈 Analytics"):
                with gr.Row():
                    temporal_chart = gr.Plot(value=create_temporal_chart(data))

                with gr.Row():
                    peft_chart = gr.Plot(value=create_peft_method_chart(data))
                    trainer_chart = gr.Plot(value=create_trainer_chart(data))

                with gr.Row():
                    training_chart = gr.Plot(value=create_training_type_chart(data))
                    experience_chart = gr.Plot(value=create_experience_chart(data))

                with gr.Row():
                    exp_method_chart = gr.Plot(value=create_experience_by_method_chart(data))
                    specialties_chart = gr.Plot(value=create_specialties_chart(data))

                with gr.Row():
                    model_chart = gr.Plot(value=create_model_chart(data))

                def update_charts(min_m, max_m):
                    # Charts follow the global month range only; the table
                    # filters do not affect them.
                    filtered_data = filter_data_by_months(data, min_m, max_m)
                    return (
                        create_temporal_chart(filtered_data),
                        create_peft_method_chart(filtered_data),
                        create_trainer_chart(filtered_data),
                        create_training_type_chart(filtered_data),
                        create_experience_chart(filtered_data),
                        create_experience_by_method_chart(filtered_data),
                        create_specialties_chart(filtered_data),
                        create_model_chart(filtered_data)
                    )

                for component in [min_month, max_month]:
                    component.change(
                        fn=update_charts,
                        inputs=[min_month, max_month],
                        outputs=[
                            temporal_chart, peft_chart, trainer_chart,
                            training_chart, experience_chart,
                            exp_method_chart, specialties_chart, model_chart
                        ]
                    )

            with gr.Tab("ℹ️ About"):
                gr.Markdown(f"""
                ## About This Dashboard

                This dashboard analyzes 345 recent issues from the [huggingface/peft](https://github.com/huggingface/peft) repository.

                **Time Range**: {month_range[0]} to {month_range[1]}
                **Total Issues**: {len(data)}

                ### Data Collection Method

                **LLM Classification** (current view):
                - All 345 issues classified by a language model reading the full title + body
                - More accurate than regex-based extraction, especially for nuanced classifications
                - Experience scores and training types are LLM-inferred from context

                **Validation Process**:
                1. **Static analysis** (rule-based): Extracted via regex patterns
                2. **LLM classification**: Language model read all 345 issues in 4 chunks
                3. **Comparison**: Identified systematic biases in the static analyzer
                4. **Merged results**: This dashboard uses the LLM classification (more accurate)

                ### Why LLM Classification?

                LLM outperforms static analysis on nuanced tasks:
                - **Experience score**: LLM understands issue quality, tone, and depth (44.3% agreement with static)
                - **Training type**: LLM distinguishes actual training from feature requests (61.2% agreement)
                - **PEFT method**: LLM detects context (73.9% agreement)

                ### Metrics Explained

                **Experience Score (0-10)**:
                - **Code reproduction**: +2 if runnable code snippet or clear numbered steps
                - **Error details**: +2 if actual traceback or error block
                - **Environment info**: +2 if actual version numbers or system info table
                - **Clarity**: +2 if clear title (4+ words) and substantial body (50+ words)
                - **Technical depth**: +2 if 2+ technical terms used in proper context

                **Confidence Levels**:
                - **High**: Strong evidence in the issue text
                - **Medium**: Some evidence or inference
                - **Low**: Limited or no evidence

                ### Known Limitations
                - Model detection: Many PEFT issues are framework-level bugs without model mentions
                - Trainer detection: Most users don't specify their training framework
                - Training type: "unsure" means the issue lacks clear training context (often infrastructure/bug reports)
                - LLM may occasionally hallucinate or misread complex technical details

                ### Data Sources
                - Issues fetched via GitHub API on 2026-05-12 (345 issues, most recently updated)
                - LLM classification performed on all 345 issues in 4 batches
                - Raw data preserved for transparency and re-analysis
                """)

    return app
476
+
477
+
478
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all network
    # interfaces at port 7860 without creating a public share link.
    app = build_app()
    app.launch(
        theme=gr.themes.Soft(),
        server_port=7860,
        server_name="0.0.0.0",
        share=False,
    )
peft_issues_analyzed_500.json ADDED
The diff for this file is too large to render. See raw diff