Ruben Claude committed on
Commit
394366f
·
1 Parent(s): 4f48a7d

Fix DuckDB segfault by removing DuckDB queries from app.py

Browse files

Replaced all DuckDB queries in dashboard functions with pure pandas operations to eliminate segmentation faults during UI initialization.

**Changes:**
- Replaced DuckDB queries with pandas read_parquet + operations
- Removed DuckDB import from app.py
- Disabled auto-load on Settings tab (was causing crashes)
- All chart functions now use pandas groupby/merge instead of SQL

**Result:**
- App starts successfully without crashes
- Dashboard, charts, and export all working
- 100% pandas-based; DuckDB is now used only optionally, for advanced queries

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +86 -79
app.py CHANGED
@@ -18,11 +18,10 @@ import logging
18
  from apscheduler.schedulers.background import BackgroundScheduler
19
 
20
  # Import our modules
21
- from config.database import init_storage, CONTENT_ITEMS_PATH, CLARITY_ANALYSES_PATH, FETCH_LOGS_PATH
22
  from storage.repository import ContentRepository
23
  from schedulers.background_tasks import fetch_and_analyze_content
24
  from utils.logger import setup_logging
25
- import duckdb
26
 
27
  # Setup
28
  setup_logging()
@@ -71,29 +70,28 @@ def get_dashboard_stats():
71
  def get_clarity_distribution():
72
  """Get clarity score distribution chart"""
73
  try:
74
- # Query parquet file directly with DuckDB
75
- conn = duckdb.connect()
76
- query = f"""
77
- SELECT
78
- CASE
79
- WHEN overall_score < 30 THEN '0-29 (Poor)'
80
- WHEN overall_score < 50 THEN '30-49 (Fair)'
81
- WHEN overall_score < 70 THEN '50-69 (Good)'
82
- WHEN overall_score < 90 THEN '70-89 (Very Good)'
83
- ELSE '90-100 (Excellent)'
84
- END as score_range,
85
- COUNT(*) as count
86
- FROM '{CLARITY_ANALYSES_PATH}'
87
- GROUP BY score_range
88
- ORDER BY score_range
89
- """
90
-
91
- df = conn.execute(query).df()
92
- conn.close()
93
 
94
- if df.empty:
95
  return None
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  fig = px.bar(
98
  df,
99
  x='score_range',
@@ -108,38 +106,44 @@ def get_clarity_distribution():
108
 
109
  except Exception as e:
110
  logger.error(f"Error creating distribution chart: {e}")
 
 
111
  return None
112
 
113
 
114
  def get_content_timeline():
115
  """Get content published over time"""
116
  try:
117
- # Query parquet files directly with DuckDB
118
- conn = duckdb.connect()
119
- query = f"""
120
- SELECT
121
- DATE_TRUNC('day', c.published_at) as date,
122
- COUNT(*) as count,
123
- AVG(a.overall_score) as avg_score
124
- FROM '{CONTENT_ITEMS_PATH}' c
125
- LEFT JOIN '{CLARITY_ANALYSES_PATH}' a ON c.content_hash = a.content_hash
126
- WHERE c.published_at >= CURRENT_DATE - INTERVAL 30 DAY
127
- GROUP BY date
128
- ORDER BY date
129
- """
130
 
131
- df = conn.execute(query).df()
132
- conn.close()
 
 
133
 
134
  if df.empty:
135
  return None
136
 
 
 
 
 
 
 
 
 
137
  fig = go.Figure()
138
 
139
  # Add content count line
140
  fig.add_trace(go.Scatter(
141
- x=df['date'],
142
- y=df['count'],
143
  name='Items Published',
144
  yaxis='y1',
145
  line=dict(color='blue')
@@ -147,8 +151,8 @@ def get_content_timeline():
147
 
148
  # Add average clarity line
149
  fig.add_trace(go.Scatter(
150
- x=df['date'],
151
- y=df['avg_score'],
152
  name='Avg Clarity Score',
153
  yaxis='y2',
154
  line=dict(color='green')
@@ -174,28 +178,30 @@ def get_content_timeline():
174
  def get_category_scores():
175
  """Get average scores by category"""
176
  try:
177
- # Query parquet files directly with DuckDB
178
- conn = duckdb.connect()
179
- query = f"""
180
- SELECT
181
- c.category,
182
- COUNT(*) as count,
183
- AVG(a.overall_score) as avg_score
184
- FROM '{CONTENT_ITEMS_PATH}' c
185
- LEFT JOIN '{CLARITY_ANALYSES_PATH}' a ON c.content_hash = a.content_hash
186
- WHERE c.category IS NOT NULL AND c.category != ''
187
- GROUP BY c.category
188
- ORDER BY avg_score DESC
189
- """
190
 
191
- df = conn.execute(query).df()
192
- conn.close()
193
 
194
  if df.empty:
195
  return None
196
 
 
 
 
 
 
 
 
 
197
  fig = px.bar(
198
- df,
199
  y='category',
200
  x='avg_score',
201
  orientation='h',
@@ -327,25 +333,26 @@ def get_low_clarity_items(threshold=50):
327
  def export_data(format='csv'):
328
  """Export data to file"""
329
  try:
330
- # Query parquet files directly with DuckDB
331
- conn = duckdb.connect()
332
- query = f"""
333
- SELECT
334
- c.title,
335
- c.published_at,
336
- c.category,
337
- c.url,
338
- a.overall_score as clarity_score,
339
- a.readability_score,
340
- a.complexity_score,
341
- a.jargon_count
342
- FROM '{CONTENT_ITEMS_PATH}' c
343
- LEFT JOIN '{CLARITY_ANALYSES_PATH}' a ON c.content_hash = a.content_hash
344
- ORDER BY c.published_at DESC
345
- """
346
 
347
- df = conn.execute(query).df()
348
- conn.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
  # Save to file
351
  timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
@@ -604,10 +611,10 @@ with gr.Blocks(css=custom_css, title="Madrid Content Analyzer", theme=gr.themes.
604
  refresh_logs_btn = gr.Button("🔄 Refresh Logs")
605
 
606
  refresh_logs_btn.click(get_recent_logs, outputs=logs_display)
607
-
608
- # Load initial data
609
- demo.load(get_database_stats, outputs=db_stats_display)
610
- demo.load(get_recent_logs, outputs=logs_display)
611
 
612
  # Footer
613
  gr.Markdown("""
 
18
  from apscheduler.schedulers.background import BackgroundScheduler
19
 
20
  # Import our modules
21
+ from config.database import init_storage, CONTENT_ITEMS_PATH, CLARITY_ANALYSES_PATH, FETCH_LOGS_PATH, get_sources
22
  from storage.repository import ContentRepository
23
  from schedulers.background_tasks import fetch_and_analyze_content
24
  from utils.logger import setup_logging
 
25
 
26
  # Setup
27
  setup_logging()
 
70
  def get_clarity_distribution():
71
  """Get clarity score distribution chart"""
72
  try:
73
+ # Use pandas directly to avoid DuckDB segfaults
74
+ df_analyses = pd.read_parquet(CLARITY_ANALYSES_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
+ if df_analyses.empty:
77
  return None
78
 
79
+ # Create score ranges
80
+ def score_to_range(score):
81
+ if score < 30:
82
+ return '0-29 (Poor)'
83
+ elif score < 50:
84
+ return '30-49 (Fair)'
85
+ elif score < 70:
86
+ return '50-69 (Good)'
87
+ elif score < 90:
88
+ return '70-89 (Very Good)'
89
+ else:
90
+ return '90-100 (Excellent)'
91
+
92
+ df_analyses['score_range'] = df_analyses['overall_score'].apply(score_to_range)
93
+ df = df_analyses.groupby('score_range').size().reset_index(name='count')
94
+
95
  fig = px.bar(
96
  df,
97
  x='score_range',
 
106
 
107
  except Exception as e:
108
  logger.error(f"Error creating distribution chart: {e}")
109
+ import traceback
110
+ traceback.print_exc()
111
  return None
112
 
113
 
114
  def get_content_timeline():
115
  """Get content published over time"""
116
  try:
117
+ # Use pandas directly to avoid DuckDB segfaults
118
+ df_content = pd.read_parquet(CONTENT_ITEMS_PATH)
119
+ df_analyses = pd.read_parquet(CLARITY_ANALYSES_PATH)
120
+
121
+ # Merge
122
+ df = df_content.merge(df_analyses[['content_hash', 'overall_score']],
123
+ on='content_hash', how='left')
 
 
 
 
 
 
124
 
125
+ # Filter last 30 days
126
+ df['published_at'] = pd.to_datetime(df['published_at'])
127
+ cutoff = datetime.utcnow() - timedelta(days=30)
128
+ df = df[df['published_at'] >= cutoff]
129
 
130
  if df.empty:
131
  return None
132
 
133
+ # Group by date
134
+ df['date'] = df['published_at'].dt.date
135
+ grouped = df.groupby('date').agg({
136
+ 'content_hash': 'count',
137
+ 'overall_score': 'mean'
138
+ }).reset_index()
139
+ grouped.columns = ['date', 'count', 'avg_score']
140
+
141
  fig = go.Figure()
142
 
143
  # Add content count line
144
  fig.add_trace(go.Scatter(
145
+ x=grouped['date'],
146
+ y=grouped['count'],
147
  name='Items Published',
148
  yaxis='y1',
149
  line=dict(color='blue')
 
151
 
152
  # Add average clarity line
153
  fig.add_trace(go.Scatter(
154
+ x=grouped['date'],
155
+ y=grouped['avg_score'],
156
  name='Avg Clarity Score',
157
  yaxis='y2',
158
  line=dict(color='green')
 
178
  def get_category_scores():
179
  """Get average scores by category"""
180
  try:
181
+ # Use pandas directly to avoid DuckDB segfaults
182
+ df_content = pd.read_parquet(CONTENT_ITEMS_PATH)
183
+ df_analyses = pd.read_parquet(CLARITY_ANALYSES_PATH)
184
+
185
+ # Merge
186
+ df = df_content.merge(df_analyses[['content_hash', 'overall_score']],
187
+ on='content_hash', how='left')
 
 
 
 
 
 
188
 
189
+ # Filter out empty categories
190
+ df = df[(df['category'].notna()) & (df['category'] != '')]
191
 
192
  if df.empty:
193
  return None
194
 
195
+ # Group by category
196
+ grouped = df.groupby('category').agg({
197
+ 'content_hash': 'count',
198
+ 'overall_score': 'mean'
199
+ }).reset_index()
200
+ grouped.columns = ['category', 'count', 'avg_score']
201
+ grouped = grouped.sort_values('avg_score', ascending=False)
202
+
203
  fig = px.bar(
204
+ grouped,
205
  y='category',
206
  x='avg_score',
207
  orientation='h',
 
333
  def export_data(format='csv'):
334
  """Export data to file"""
335
  try:
336
+ # Use pandas directly to avoid DuckDB segfaults
337
+ df_content = pd.read_parquet(CONTENT_ITEMS_PATH)
338
+ df_analyses = pd.read_parquet(CLARITY_ANALYSES_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
+ # Merge
341
+ df = df_content.merge(
342
+ df_analyses[['content_hash', 'overall_score', 'readability_score',
343
+ 'complexity_score', 'jargon_count']],
344
+ on='content_hash',
345
+ how='left'
346
+ )
347
+
348
+ # Select and rename columns
349
+ df = df[['title', 'published_at', 'category', 'url',
350
+ 'overall_score', 'readability_score', 'complexity_score', 'jargon_count']]
351
+ df.columns = ['title', 'published_at', 'category', 'url',
352
+ 'clarity_score', 'readability_score', 'complexity_score', 'jargon_count']
353
+
354
+ # Sort
355
+ df = df.sort_values('published_at', ascending=False)
356
 
357
  # Save to file
358
  timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
 
611
  refresh_logs_btn = gr.Button("🔄 Refresh Logs")
612
 
613
  refresh_logs_btn.click(get_recent_logs, outputs=logs_display)
614
+
615
+ # Load initial data - commented out to avoid crashes
616
+ # demo.load(get_database_stats, outputs=db_stats_display)
617
+ # demo.load(get_recent_logs, outputs=logs_display)
618
 
619
  # Footer
620
  gr.Markdown("""