iyadsultan committed on
Commit
ac86d10
·
1 Parent(s): 9d060d5
Files changed (1) hide show
  1. app.py +137 -212
app.py CHANGED
@@ -67,143 +67,39 @@ def load_documents():
67
  try:
68
  file_path = os.path.join(DATA_DIR, 'documents.csv')
69
 
 
70
  if not os.path.exists(file_path):
71
- log_error(f"File not found: {file_path}")
72
  return []
73
-
74
- # Try to detect file encoding
75
  try:
 
76
  encoding = detect_encoding(file_path)
77
  log_error(f"Detected encoding: {encoding}")
78
- except Exception as e:
79
- encoding = 'utf-8' # Default to UTF-8 if detection fails
80
- log_error(f"Failed to detect encoding, using utf-8: {str(e)}")
81
-
82
- # Try multiple parsing approaches
83
- try:
84
- # First attempt: Use pandas with standard settings
85
  df = pd.read_csv(file_path, encoding=encoding)
86
  log_error("Successfully parsed CSV with standard settings")
87
- except Exception as e1:
88
- log_error(f"First parsing attempt failed: {str(e1)}")
89
- try:
90
- # Second attempt: Use maximum quoting
91
- df = pd.read_csv(
92
- file_path,
93
- encoding=encoding,
94
- quoting=csv.QUOTE_ALL,
95
- escapechar='\\'
96
- )
97
- log_error("Successfully parsed CSV with QUOTE_ALL")
98
- except Exception as e2:
99
- log_error(f"Second parsing attempt failed: {str(e2)}")
100
- try:
101
- # Third attempt: Use minimal quoting
102
- df = pd.read_csv(
103
- file_path,
104
- encoding=encoding,
105
- quoting=csv.QUOTE_MINIMAL,
106
- escapechar='\\',
107
- error_bad_lines=False # Skip bad lines
108
- )
109
- log_error("Successfully parsed CSV with QUOTE_MINIMAL")
110
- except Exception as e3:
111
- log_error(f"Third parsing attempt failed: {str(e3)}")
112
- # Final attempt: Use Python's CSV module directly
113
- log_error("Attempting to parse with CSV module directly")
114
- rows = []
115
- headers = None
116
- try:
117
- with open(file_path, 'r', encoding=encoding, newline='') as f:
118
- reader = csv.reader(f)
119
- headers = next(reader)
120
- for row in reader:
121
- if len(row) >= 4: # Ensure we have at least 4 columns
122
- rows.append(row)
123
-
124
- if not headers or not rows:
125
- log_error("No valid data or headers found in CSV")
126
- return []
127
-
128
- df = pd.DataFrame(rows, columns=headers)
129
- log_error(f"Successfully parsed with CSV module. Found {len(rows)} rows.")
130
- except Exception as e4:
131
- log_error(f"All parsing attempts failed: {str(e4)}")
132
- return []
133
-
134
- # Log dataframe information to help debugging
135
- log_error(f"DataFrame columns: {df.columns.tolist()}")
136
- log_error(f"DataFrame shape: {df.shape}")
137
-
138
- # Ensure we have the required columns
139
- required_columns = ['filename', 'description', 'mrn', 'note']
140
- missing_columns = [col for col in required_columns if col not in df.columns]
141
-
142
- if missing_columns:
143
- log_error(f"Missing required columns: {missing_columns}")
144
- # Try to handle common column name variations
145
- column_map = {}
146
- for req_col in missing_columns:
147
- # Check for case-insensitive matches
148
- matches = [col for col in df.columns if col.lower() == req_col.lower()]
149
- if matches:
150
- column_map[matches[0]] = req_col
151
-
152
- # Rename columns if matches found
153
- if column_map:
154
- df = df.rename(columns=column_map)
155
- log_error(f"Renamed columns: {column_map}")
156
- # Check again for missing columns
157
- missing_columns = [col for col in required_columns if col not in df.columns]
158
-
159
- if missing_columns:
160
- # If still missing columns, try to infer them from position
161
- if len(df.columns) >= 4 and len(missing_columns) <= 4:
162
- log_error("Attempting to infer columns by position")
163
- new_columns = []
164
- df_columns = df.columns.tolist()
165
-
166
- for i, req_col in enumerate(required_columns):
167
- if i < len(df_columns):
168
- if req_col in df.columns:
169
- new_columns.append(req_col)
170
- else:
171
- new_columns.append(req_col)
172
- # Rename the column
173
- old_col = df_columns[i]
174
- column_map[old_col] = req_col
175
-
176
- if column_map:
177
- df = df.rename(columns=column_map)
178
- log_error(f"Inferred columns by position: {column_map}")
179
-
180
- # Final check for required columns
181
- missing_columns = [col for col in required_columns if col not in df.columns]
182
- if missing_columns:
183
- log_error(f"Still missing required columns after all attempts: {missing_columns}")
184
- raise ValueError(f"Missing required columns in documents.csv: {', '.join(missing_columns)}")
185
-
186
- # If description is empty, replace with empty string
187
- if 'description' in df.columns:
188
- df['description'] = df['description'].fillna('')
189
-
190
- # If MRN is empty, replace with empty string
191
- if 'mrn' in df.columns:
192
- df['mrn'] = df['mrn'].fillna('')
193
 
194
- # Convert all columns to string to ensure compatability
195
  for col in df.columns:
196
  df[col] = df[col].astype(str)
197
 
198
- # Log first few rows to help debugging
199
- log_error(f"First row: {df.iloc[0].to_dict() if len(df) > 0 else 'No rows'}")
 
 
 
 
200
 
201
- # Load evaluated documents
 
202
  try:
203
- evaluations_path = os.path.join(DATA_DIR, 'evaluations.csv')
204
- if os.path.exists(evaluations_path):
205
- evaluations_df = pd.read_csv(evaluations_path)
206
- # Get unique document titles that have been evaluated
207
  evaluated_titles = evaluations_df['document_title'].unique()
208
  # Filter out documents that have already been evaluated
209
  df = df[~df['filename'].isin(evaluated_titles)]
@@ -213,20 +109,14 @@ def load_documents():
213
  except Exception as e:
214
  log_error(f"Error loading evaluations: {str(e)}")
215
  # If error, assume no evaluations
216
- pass
217
 
 
218
  documents = df.to_dict('records')
219
  log_error(f"Returning {len(documents)} documents for evaluation")
220
  return documents
221
- except FileNotFoundError:
222
- log_error(f"The documents.csv file was not found in {DATA_DIR}")
223
- flash("The documents.csv file was not found. Please create this file with your documents for evaluation.")
224
- return []
225
  except Exception as e:
226
- error_msg = f"Error loading documents: {str(e)}"
227
- log_error(error_msg)
228
- flash(f"{error_msg}. Please check the format of your documents.csv file.")
229
- traceback.print_exc() # Print the full traceback for debugging
230
  return []
231
 
232
  def save_evaluation(data):
@@ -279,91 +169,88 @@ def get_evaluated_document_count():
279
  except Exception:
280
  return 0
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  @app.route('/', methods=['GET', 'POST'])
283
  def index():
284
- """Landing page with file upload and evaluator name."""
285
  if request.method == 'POST':
286
  # Ensure data directory exists
287
  ensure_data_directory()
288
 
289
- # Check if the post request has the file part
290
- if 'file' not in request.files:
291
- error_msg = 'No file part in the request. Please try again.'
292
- log_error(error_msg)
293
- flash(error_msg)
294
- return redirect(request.url)
295
-
296
- file = request.files['file']
297
- evaluator_name = request.form.get('evaluator_name', '').strip()
298
-
299
- # If user does not select file, browser also
300
- # submits an empty part without filename
301
- if file.filename == '':
302
- error_msg = 'No file selected. Please select a file to upload.'
303
- log_error(error_msg)
304
- flash(error_msg)
305
- return redirect(request.url)
306
-
307
  if not evaluator_name:
308
- error_msg = 'Please enter your name as the evaluator.'
309
- log_error(error_msg)
310
- flash(error_msg)
311
- return redirect(request.url)
312
-
313
- # Store evaluator name in session
314
- session['evaluator_name'] = evaluator_name
315
 
316
- if file:
317
- try:
318
- # Instead of overwriting documents.csv directly, save to a temp file first
319
- temp_path = os.path.join(DATA_DIR, 'temp_upload.csv')
320
- file.save(temp_path)
321
-
322
- # Then try to read it using pandas to validate
323
- df = load_and_validate_csv(temp_path)
324
-
325
- # If successful, move the temp file to documents.csv
326
- documents_path = os.path.join(DATA_DIR, 'documents.csv')
327
- shutil.move(temp_path, documents_path)
328
-
329
- # Check file size and contents
330
- file_size = os.path.getsize(documents_path)
331
- log_error(f"File size: {file_size} bytes")
332
-
333
- if file_size == 0:
334
- error_msg = 'The uploaded file is empty. Please check your file and try again.'
335
- log_error(error_msg)
336
- flash(error_msg)
337
- return redirect(url_for('index'))
338
-
339
- # Preview the file contents
340
  try:
341
- with open(documents_path, 'r', encoding='utf-8') as f:
342
- first_lines = [next(f) for _ in range(5) if f]
343
- log_error(f"File preview: {first_lines}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  except Exception as e:
345
- log_error(f"Could not preview file: {str(e)}")
346
-
347
- # Try to load documents to verify the file format
348
- docs = load_documents()
349
-
350
- if not docs:
351
- error_msg = 'No valid documents found in the uploaded file. Please check the file format and try again.'
352
- log_error(error_msg)
353
- flash(error_msg)
354
- return redirect(url_for('index'))
355
-
356
- # Success - redirect to the evaluation page
357
- log_error(f"Successfully loaded {len(docs)} documents, redirecting to evaluate page")
358
- return redirect(url_for('evaluate'))
359
-
360
- except Exception as e:
361
- error_msg = f'Error during file upload: {str(e)}'
362
- log_error(error_msg)
363
- flash(f'{error_msg}. Please try again.')
364
- traceback.print_exc() # Print the full traceback for debugging
365
- return redirect(url_for('index'))
366
 
 
367
  return render_template('index.html')
368
 
369
  @app.route('/evaluate', methods=['GET', 'POST'])
@@ -487,18 +374,30 @@ def export_csv():
487
  @app.route('/debug')
488
  def debug():
489
  """Debug page showing application state."""
 
 
 
 
 
 
 
490
  # Get documents
491
  documents = load_documents()
492
 
493
- # Get evaluations
494
  eval_df, _, _ = get_results()
495
- evaluations = eval_df.to_dict('records') if not eval_df.empty else []
 
 
 
 
 
496
 
497
  return render_template('debug.html',
498
  documents=documents,
499
  evaluations=evaluations,
500
- documents_exists=os.path.exists(os.path.join(DATA_DIR, 'documents.csv')),
501
- evaluations_exists=os.path.exists(os.path.join(DATA_DIR, 'evaluations.csv')),
502
  errors=ERROR_LOG
503
  )
504
 
@@ -546,10 +445,36 @@ def reset():
546
  flash('Session reset. You can start a new evaluation.')
547
  return redirect(url_for('index'))
548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549
  if __name__ == '__main__':
550
- # Set up the data directory and initial files
 
 
551
  ensure_data_directory()
 
 
552
  copy_template_if_needed()
553
 
554
  # Run the app
 
555
  app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))
 
67
  try:
68
  file_path = os.path.join(DATA_DIR, 'documents.csv')
69
 
70
+ # Check if file exists
71
  if not os.path.exists(file_path):
72
+ log_error(f"Documents file not found at {file_path}")
73
  return []
74
+
 
75
  try:
76
+ # Try to detect encoding
77
  encoding = detect_encoding(file_path)
78
  log_error(f"Detected encoding: {encoding}")
79
+
80
+ # Try to read with pandas using the detected encoding
 
 
 
 
 
81
  df = pd.read_csv(file_path, encoding=encoding)
82
  log_error("Successfully parsed CSV with standard settings")
83
+ except Exception as e:
84
+ log_error(f"Error parsing CSV: {str(e)}")
85
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
+ # Convert columns to string to ensure compatibility
88
  for col in df.columns:
89
  df[col] = df[col].astype(str)
90
 
91
+ # Log some stats
92
+ log_error(f"DataFrame columns: {list(df.columns)}")
93
+ log_error(f"DataFrame shape: {df.shape}")
94
+
95
+ if not df.empty:
96
+ log_error(f"First row: {df.iloc[0].to_dict()}")
97
 
98
+ # Check for evaluations file to exclude already evaluated documents
99
+ eval_path = os.path.join(DATA_DIR, 'evaluations.csv')
100
  try:
101
+ if os.path.exists(eval_path):
102
+ evaluations_df = pd.read_csv(eval_path)
 
 
103
  evaluated_titles = evaluations_df['document_title'].unique()
104
  # Filter out documents that have already been evaluated
105
  df = df[~df['filename'].isin(evaluated_titles)]
 
109
  except Exception as e:
110
  log_error(f"Error loading evaluations: {str(e)}")
111
  # If error, assume no evaluations
 
112
 
113
+ # Convert to list of dictionaries
114
  documents = df.to_dict('records')
115
  log_error(f"Returning {len(documents)} documents for evaluation")
116
  return documents
117
+
 
 
 
118
  except Exception as e:
119
+ log_error(f"Error in load_documents: {str(e)}")
 
 
 
120
  return []
121
 
122
  def save_evaluation(data):
 
169
  except Exception:
170
  return 0
171
 
172
def load_and_validate_csv(file_path):
    """Parse a CSV file and confirm it carries the required columns.

    The file is read with ``pd.read_csv`` using the encoding reported by
    ``detect_encoding``. Progress and failures are recorded via ``log_error``.

    Args:
        file_path: Path of the CSV file to check.

    Returns:
        The parsed ``DataFrame``.

    Raises:
        ValueError: If any of ``filename``, ``description``, ``mrn`` or
            ``note`` is absent from the parsed columns.
        Exception: Any error raised while detecting the encoding or parsing
            the file is logged and then re-raised unchanged.
    """
    expected = ['filename', 'description', 'mrn', 'note']

    try:
        detected = detect_encoding(file_path)
        log_error(f"Detected encoding: {detected}")

        frame = pd.read_csv(file_path, encoding=detected)
        log_error("Successfully parsed CSV with standard settings")

        # Collect every absent column so the error names all of them at once.
        absent = []
        for name in expected:
            if name not in frame.columns:
                absent.append(name)

        if absent:
            log_error(f"Missing required columns: {absent}")
            raise ValueError(f"Missing required columns: {absent}")

        log_error(f"DataFrame columns: {list(frame.columns)}")
        log_error(f"DataFrame shape: {frame.shape}")

        if not frame.empty:
            first_record = frame.iloc[0].to_dict()
            log_error(f"First row: {first_record}")

        return frame

    except Exception as exc:
        # Log for the debug page, then let the caller decide how to react.
        log_error(f"Error validating CSV file: {str(exc)}")
        raise
203
+
204
  @app.route('/', methods=['GET', 'POST'])
205
  def index():
 
206
  if request.method == 'POST':
207
  # Ensure data directory exists
208
  ensure_data_directory()
209
 
210
+ # Get evaluator name
211
+ evaluator_name = request.form.get('evaluator_name', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  if not evaluator_name:
213
+ flash("Please enter your name as the evaluator.")
214
+ return render_template('index.html')
 
 
 
 
 
215
 
216
+ # Process file upload
217
+ if 'file' in request.files:
218
+ file = request.files['file']
219
+
220
+ if file.filename == '':
221
+ flash("No file selected.")
222
+ return render_template('index.html')
223
+
224
+ if file and '.' in file.filename and file.filename.rsplit('.', 1)[1].lower() == 'csv':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  try:
226
+ # Create a temporary file path
227
+ temp_path = os.path.join(DATA_DIR, 'temp_upload.csv')
228
+
229
+ # Save the uploaded file
230
+ file.save(temp_path)
231
+
232
+ # Validate the CSV format
233
+ df = load_and_validate_csv(temp_path)
234
+
235
+ # If valid, move to documents.csv
236
+ documents_path = os.path.join(DATA_DIR, 'documents.csv')
237
+ shutil.move(temp_path, documents_path)
238
+
239
+ # Set session cookie
240
+ session['evaluator_name'] = evaluator_name
241
+
242
+ flash("File uploaded successfully!")
243
+ return redirect(url_for('evaluate'))
244
+
245
  except Exception as e:
246
+ log_error(f"Error during file upload: {str(e)}")
247
+ flash(f"Error during file upload: {str(e)}. Please try again.")
248
+ else:
249
+ flash("Please upload a CSV file.")
250
+
251
+ return render_template('index.html')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
+ # Handle GET request
254
  return render_template('index.html')
255
 
256
  @app.route('/evaluate', methods=['GET', 'POST'])
 
374
  @app.route('/debug')
375
  def debug():
376
  """Debug page showing application state."""
377
+ # Check for data directory
378
+ if not os.path.exists(DATA_DIR):
379
+ try:
380
+ ensure_data_directory()
381
+ except Exception as e:
382
+ log_error(f"Failed to create data directory in debug route: {str(e)}")
383
+
384
  # Get documents
385
  documents = load_documents()
386
 
387
+ # Get evaluations - properly handle DataFrame
388
  eval_df, _, _ = get_results()
389
+ # Convert DataFrame to list of dictionaries if not empty
390
+ evaluations = [] if eval_df is None or eval_df.empty else eval_df.to_dict('records')
391
+
392
+ # Get file paths
393
+ docs_path = os.path.join(DATA_DIR, 'documents.csv')
394
+ evals_path = os.path.join(DATA_DIR, 'evaluations.csv')
395
 
396
  return render_template('debug.html',
397
  documents=documents,
398
  evaluations=evaluations,
399
+ documents_exists=os.path.exists(docs_path),
400
+ evaluations_exists=os.path.exists(evals_path),
401
  errors=ERROR_LOG
402
  )
403
 
 
445
  flash('Session reset. You can start a new evaluation.')
446
  return redirect(url_for('index'))
447
 
448
def ensure_data_directory():
    """Create ``DATA_DIR`` if it does not exist and report the outcome.

    Errors are printed rather than raised, so this function never propagates
    an ``Exception`` to its caller.
    """
    try:
        os.makedirs(DATA_DIR, exist_ok=True)
        success_message = f"Created/verified data directory at {DATA_DIR}"
        print(success_message)
    except Exception as exc:
        reason = str(exc)
        print(f"Error creating data directory: {reason}")
455
+
456
def copy_template_if_needed():
    """Seed ``documents.csv`` from the sample template when it is absent.

    Nothing happens if ``documents.csv`` already exists in ``DATA_DIR`` or if
    ``sample_documents_template.csv`` cannot be found. Copy errors are printed
    rather than raised.
    """
    target = os.path.join(DATA_DIR, 'documents.csv')
    if os.path.exists(target):
        return

    try:
        # The template path is relative, so it is looked up from the
        # process's current working directory.
        source = 'sample_documents_template.csv'
        if not os.path.exists(source):
            return
        shutil.copy(source, target)
        print(f"Copied template to {target}")
    except Exception as exc:
        print(f"Error copying template: {str(exc)}")
468
+
469
  if __name__ == '__main__':
470
+ print("\n===== Application Startup at", datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "=====\n")
471
+
472
+ # Create data directory
473
  ensure_data_directory()
474
+
475
+ # Copy template files if needed
476
  copy_template_if_needed()
477
 
478
  # Run the app
479
+ app.config['DEBUG'] = True
480
  app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))