Spaces:

iyadsultan
/

human_evaluator

Running

App Files Community

iyadsultan committed on Mar 7, 2025

Commit

ac86d10

1 Parent(s): 9d060d5

6th

Browse files

Files changed (1) hide show

app.py +137 -212

app.py CHANGED Viewed

@@ -67,143 +67,39 @@ def load_documents():
     try:
         file_path = os.path.join(DATA_DIR, 'documents.csv')
         if not os.path.exists(file_path):
-            log_error(f"File not found: {file_path}")
             return []
-        # Try to detect file encoding
         try:
             encoding = detect_encoding(file_path)
             log_error(f"Detected encoding: {encoding}")
-        except Exception as e:
-            encoding = 'utf-8'  # Default to UTF-8 if detection fails
-            log_error(f"Failed to detect encoding, using utf-8: {str(e)}")
-        # Try multiple parsing approaches
-        try:
-            # First attempt: Use pandas with standard settings
             df = pd.read_csv(file_path, encoding=encoding)
             log_error("Successfully parsed CSV with standard settings")
-        except Exception as e1:
-            log_error(f"First parsing attempt failed: {str(e1)}")
-            try:
-                # Second attempt: Use maximum quoting
-                df = pd.read_csv(
-                    file_path,
-                    encoding=encoding,
-                    quoting=csv.QUOTE_ALL,
-                    escapechar='\\'
-                )
-                log_error("Successfully parsed CSV with QUOTE_ALL")
-            except Exception as e2:
-                log_error(f"Second parsing attempt failed: {str(e2)}")
-                try:
-                    # Third attempt: Use minimal quoting
-                    df = pd.read_csv(
-                        file_path,
-                        encoding=encoding,
-                        quoting=csv.QUOTE_MINIMAL,
-                        escapechar='\\',
-                        error_bad_lines=False  # Skip bad lines
-                    )
-                    log_error("Successfully parsed CSV with QUOTE_MINIMAL")
-                except Exception as e3:
-                    log_error(f"Third parsing attempt failed: {str(e3)}")
-                    # Final attempt: Use Python's CSV module directly
-                    log_error("Attempting to parse with CSV module directly")
-                    rows = []
-                    headers = None
-                    try:
-                        with open(file_path, 'r', encoding=encoding, newline='') as f:
-                            reader = csv.reader(f)
-                            headers = next(reader)
-                            for row in reader:
-                                if len(row) >= 4:  # Ensure we have at least 4 columns
-                                    rows.append(row)
-                        if not headers or not rows:
-                            log_error("No valid data or headers found in CSV")
-                            return []
-                        df = pd.DataFrame(rows, columns=headers)
-                        log_error(f"Successfully parsed with CSV module. Found {len(rows)} rows.")
-                    except Exception as e4:
-                        log_error(f"All parsing attempts failed: {str(e4)}")
-                        return []
-        # Log dataframe information to help debugging
-        log_error(f"DataFrame columns: {df.columns.tolist()}")
-        log_error(f"DataFrame shape: {df.shape}")
-        # Ensure we have the required columns
-        required_columns = ['filename', 'description', 'mrn', 'note']
-        missing_columns = [col for col in required_columns if col not in df.columns]
-        if missing_columns:
-            log_error(f"Missing required columns: {missing_columns}")
-            # Try to handle common column name variations
-            column_map = {}
-            for req_col in missing_columns:
-                # Check for case-insensitive matches
-                matches = [col for col in df.columns if col.lower() == req_col.lower()]
-                if matches:
-                    column_map[matches[0]] = req_col
-            # Rename columns if matches found
-            if column_map:
-                df = df.rename(columns=column_map)
-                log_error(f"Renamed columns: {column_map}")
-                # Check again for missing columns
-                missing_columns = [col for col in required_columns if col not in df.columns]
-        if missing_columns:
-            # If still missing columns, try to infer them from position
-            if len(df.columns) >= 4 and len(missing_columns) <= 4:
-                log_error("Attempting to infer columns by position")
-                new_columns = []
-                df_columns = df.columns.tolist()
-                for i, req_col in enumerate(required_columns):
-                    if i < len(df_columns):
-                        if req_col in df.columns:
-                            new_columns.append(req_col)
-                        else:
-                            new_columns.append(req_col)
-                            # Rename the column
-                            old_col = df_columns[i]
-                            column_map[old_col] = req_col
-                if column_map:
-                    df = df.rename(columns=column_map)
-                    log_error(f"Inferred columns by position: {column_map}")
-        # Final check for required columns
-        missing_columns = [col for col in required_columns if col not in df.columns]
-        if missing_columns:
-            log_error(f"Still missing required columns after all attempts: {missing_columns}")
-            raise ValueError(f"Missing required columns in documents.csv: {', '.join(missing_columns)}")
-        # If description is empty, replace with empty string
-        if 'description' in df.columns:
-            df['description'] = df['description'].fillna('')
-        # If MRN is empty, replace with empty string
-        if 'mrn' in df.columns:
-            df['mrn'] = df['mrn'].fillna('')
-        # Convert all columns to string to ensure compatability
         for col in df.columns:
             df[col] = df[col].astype(str)
-        # Log first few rows to help debugging
-        log_error(f"First row: {df.iloc[0].to_dict() if len(df) > 0 else 'No rows'}")
-        # Load evaluated documents
         try:
-            evaluations_path = os.path.join(DATA_DIR, 'evaluations.csv')
-            if os.path.exists(evaluations_path):
-                evaluations_df = pd.read_csv(evaluations_path)
-                # Get unique document titles that have been evaluated
                 evaluated_titles = evaluations_df['document_title'].unique()
                 # Filter out documents that have already been evaluated
                 df = df[~df['filename'].isin(evaluated_titles)]
@@ -213,20 +109,14 @@ def load_documents():
         except Exception as e:
             log_error(f"Error loading evaluations: {str(e)}")
             # If error, assume no evaluations
-            pass
         documents = df.to_dict('records')
         log_error(f"Returning {len(documents)} documents for evaluation")
         return documents
-    except FileNotFoundError:
-        log_error(f"The documents.csv file was not found in {DATA_DIR}")
-        flash("The documents.csv file was not found. Please create this file with your documents for evaluation.")
-        return []
     except Exception as e:
-        error_msg = f"Error loading documents: {str(e)}"
-        log_error(error_msg)
-        flash(f"{error_msg}. Please check the format of your documents.csv file.")
-        traceback.print_exc()  # Print the full traceback for debugging
         return []
 def save_evaluation(data):
@@ -279,91 +169,88 @@ def get_evaluated_document_count():
     except Exception:
         return 0
 @app.route('/', methods=['GET', 'POST'])
 def index():
-    """Landing page with file upload and evaluator name."""
     if request.method == 'POST':
         # Ensure data directory exists
         ensure_data_directory()
-        # Check if the post request has the file part
-        if 'file' not in request.files:
-            error_msg = 'No file part in the request. Please try again.'
-            log_error(error_msg)
-            flash(error_msg)
-            return redirect(request.url)
-        file = request.files['file']
-        evaluator_name = request.form.get('evaluator_name', '').strip()
-        # If user does not select file, browser also
-        # submits an empty part without filename
-        if file.filename == '':
-            error_msg = 'No file selected. Please select a file to upload.'
-            log_error(error_msg)
-            flash(error_msg)
-            return redirect(request.url)
         if not evaluator_name:
-            error_msg = 'Please enter your name as the evaluator.'
-            log_error(error_msg)
-            flash(error_msg)
-            return redirect(request.url)
-        # Store evaluator name in session
-        session['evaluator_name'] = evaluator_name
-        if file:
-            try:
-                # Instead of overwriting documents.csv directly, save to a temp file first
-                temp_path = os.path.join(DATA_DIR, 'temp_upload.csv')
-                file.save(temp_path)
-                # Then try to read it using pandas to validate
-                df = load_and_validate_csv(temp_path)
-                # If successful, move the temp file to documents.csv
-                documents_path = os.path.join(DATA_DIR, 'documents.csv')
-                shutil.move(temp_path, documents_path)
-                # Check file size and contents
-                file_size = os.path.getsize(documents_path)
-                log_error(f"File size: {file_size} bytes")
-                if file_size == 0:
-                    error_msg = 'The uploaded file is empty. Please check your file and try again.'
-                    log_error(error_msg)
-                    flash(error_msg)
-                    return redirect(url_for('index'))
-                # Preview the file contents
                 try:
-                    with open(documents_path, 'r', encoding='utf-8') as f:
-                        first_lines = [next(f) for _ in range(5) if f]
-                    log_error(f"File preview: {first_lines}")
                 except Exception as e:
-                    log_error(f"Could not preview file: {str(e)}")
-                # Try to load documents to verify the file format
-                docs = load_documents()
-                if not docs:
-                    error_msg = 'No valid documents found in the uploaded file. Please check the file format and try again.'
-                    log_error(error_msg)
-                    flash(error_msg)
-                    return redirect(url_for('index'))
-                # Success - redirect to the evaluation page
-                log_error(f"Successfully loaded {len(docs)} documents, redirecting to evaluate page")
-                return redirect(url_for('evaluate'))
-            except Exception as e:
-                error_msg = f'Error during file upload: {str(e)}'
-                log_error(error_msg)
-                flash(f'{error_msg}. Please try again.')
-                traceback.print_exc()  # Print the full traceback for debugging
-                return redirect(url_for('index'))
     return render_template('index.html')
 @app.route('/evaluate', methods=['GET', 'POST'])
@@ -487,18 +374,30 @@ def export_csv():
 @app.route('/debug')
 def debug():
     """Debug page showing application state."""
     # Get documents
     documents = load_documents()
-    # Get evaluations
     eval_df, _, _ = get_results()
-    evaluations = eval_df.to_dict('records') if not eval_df.empty else []
     return render_template('debug.html',
         documents=documents,
         evaluations=evaluations,
-        documents_exists=os.path.exists(os.path.join(DATA_DIR, 'documents.csv')),
-        evaluations_exists=os.path.exists(os.path.join(DATA_DIR, 'evaluations.csv')),
         errors=ERROR_LOG
     )
@@ -546,10 +445,36 @@ def reset():
     flash('Session reset. You can start a new evaluation.')
     return redirect(url_for('index'))
 if __name__ == '__main__':
-    # Set up the data directory and initial files
     ensure_data_directory()
     copy_template_if_needed()
     # Run the app
     app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))

     try:
         file_path = os.path.join(DATA_DIR, 'documents.csv')
+        # Check if file exists
         if not os.path.exists(file_path):
+            log_error(f"Documents file not found at {file_path}")
             return []
         try:
+            # Try to detect encoding
             encoding = detect_encoding(file_path)
             log_error(f"Detected encoding: {encoding}")
+            # Try to read with pandas using the detected encoding
             df = pd.read_csv(file_path, encoding=encoding)
             log_error("Successfully parsed CSV with standard settings")
+        except Exception as e:
+            log_error(f"Error parsing CSV: {str(e)}")
+            return []
+        # Convert columns to string to ensure compatibility
         for col in df.columns:
             df[col] = df[col].astype(str)
+        # Log some stats
+        log_error(f"DataFrame columns: {list(df.columns)}")
+        log_error(f"DataFrame shape: {df.shape}")
+        if not df.empty:
+            log_error(f"First row: {df.iloc[0].to_dict()}")
+        # Check for evaluations file to exclude already evaluated documents
+        eval_path = os.path.join(DATA_DIR, 'evaluations.csv')
         try:
+            if os.path.exists(eval_path):
+                evaluations_df = pd.read_csv(eval_path)
                 evaluated_titles = evaluations_df['document_title'].unique()
                 # Filter out documents that have already been evaluated
                 df = df[~df['filename'].isin(evaluated_titles)]
         except Exception as e:
             log_error(f"Error loading evaluations: {str(e)}")
             # If error, assume no evaluations
+        # Convert to list of dictionaries
         documents = df.to_dict('records')
         log_error(f"Returning {len(documents)} documents for evaluation")
         return documents
     except Exception as e:
+        log_error(f"Error in load_documents: {str(e)}")
         return []
 def save_evaluation(data):
     except Exception:
         return 0
+def load_and_validate_csv(file_path):
+    """Load and validate a CSV file to ensure it has the required format"""
+    try:
+        # Try to detect encoding
+        encoding = detect_encoding(file_path)
+        log_error(f"Detected encoding: {encoding}")
+        # Try to read with pandas using the detected encoding
+        df = pd.read_csv(file_path, encoding=encoding)
+        log_error("Successfully parsed CSV with standard settings")
+        # Check for required columns
+        required_columns = ['filename', 'description', 'mrn', 'note']
+        missing_columns = [col for col in required_columns if col not in df.columns]
+        if missing_columns:
+            log_error(f"Missing required columns: {missing_columns}")
+            raise ValueError(f"Missing required columns: {missing_columns}")
+        # Log success information
+        log_error(f"DataFrame columns: {list(df.columns)}")
+        log_error(f"DataFrame shape: {df.shape}")
+        if not df.empty:
+            log_error(f"First row: {df.iloc[0].to_dict()}")
+        return df
+    except Exception as e:
+        log_error(f"Error validating CSV file: {str(e)}")
+        raise
 @app.route('/', methods=['GET', 'POST'])
 def index():
     if request.method == 'POST':
         # Ensure data directory exists
         ensure_data_directory()
+        # Get evaluator name
+        evaluator_name = request.form.get('evaluator_name', '')
         if not evaluator_name:
+            flash("Please enter your name as the evaluator.")
+            return render_template('index.html')
+        # Process file upload
+        if 'file' in request.files:
+            file = request.files['file']
+            if file.filename == '':
+                flash("No file selected.")
+                return render_template('index.html')
+            if file and '.' in file.filename and file.filename.rsplit('.', 1)[1].lower() == 'csv':
                 try:
+                    # Create a temporary file path
+                    temp_path = os.path.join(DATA_DIR, 'temp_upload.csv')
+                    # Save the uploaded file
+                    file.save(temp_path)
+                    # Validate the CSV format
+                    df = load_and_validate_csv(temp_path)
+                    # If valid, move to documents.csv
+                    documents_path = os.path.join(DATA_DIR, 'documents.csv')
+                    shutil.move(temp_path, documents_path)
+                    # Set session cookie
+                    session['evaluator_name'] = evaluator_name
+                    flash("File uploaded successfully!")
+                    return redirect(url_for('evaluate'))
                 except Exception as e:
+                    log_error(f"Error during file upload: {str(e)}")
+                    flash(f"Error during file upload: {str(e)}. Please try again.")
+            else:
+                flash("Please upload a CSV file.")
+        return render_template('index.html')
+    # Handle GET request
     return render_template('index.html')
 @app.route('/evaluate', methods=['GET', 'POST'])
 @app.route('/debug')
 def debug():
     """Debug page showing application state."""
+    # Check for data directory
+    if not os.path.exists(DATA_DIR):
+        try:
+            ensure_data_directory()
+        except Exception as e:
+            log_error(f"Failed to create data directory in debug route: {str(e)}")
     # Get documents
     documents = load_documents()
+    # Get evaluations - properly handle DataFrame
     eval_df, _, _ = get_results()
+    # Convert DataFrame to list of dictionaries if not empty
+    evaluations = [] if eval_df is None or eval_df.empty else eval_df.to_dict('records')
+    # Get file paths
+    docs_path = os.path.join(DATA_DIR, 'documents.csv')
+    evals_path = os.path.join(DATA_DIR, 'evaluations.csv')
     return render_template('debug.html',
         documents=documents,
         evaluations=evaluations,
+        documents_exists=os.path.exists(docs_path),
+        evaluations_exists=os.path.exists(evals_path),
         errors=ERROR_LOG
     )
     flash('Session reset. You can start a new evaluation.')
     return redirect(url_for('index'))
+def ensure_data_directory():
+    """Ensure data directory exists"""
+    try:
+        os.makedirs(DATA_DIR, exist_ok=True)
+        print(f"Created/verified data directory at {DATA_DIR}")
+    except Exception as e:
+        print(f"Error creating data directory: {str(e)}")
+def copy_template_if_needed():
+    """Copy template file to documents.csv if it doesn't exist"""
+    documents_path = os.path.join(DATA_DIR, 'documents.csv')
+    if not os.path.exists(documents_path):
+        try:
+            # Copy from template
+            template_path = 'sample_documents_template.csv'
+            if os.path.exists(template_path):
+                shutil.copy(template_path, documents_path)
+                print(f"Copied template to {documents_path}")
+        except Exception as e:
+            print(f"Error copying template: {str(e)}")
 if __name__ == '__main__':
+    print("\n===== Application Startup at", datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "=====\n")
+    # Create data directory
     ensure_data_directory()
+    # Copy template files if needed
     copy_template_if_needed()
     # Run the app
+    app.config['DEBUG'] = True
     app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))