import os
import threading  # moved to top-level import group (was declared mid-file)

import pandas as pd
from flask import current_app
from werkzeug.utils import secure_filename

from utils.nlp_utils import preprocess_text, analyze_sentiment
from models import db, Feedback, Upload


def allowed_file(filename):
    """Return True if *filename* has an extension listed in the app's ALLOWED_EXTENSIONS."""
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in current_app.config['ALLOWED_EXTENSIONS']


def identify_text_column(df):
    """
    Heuristic to find the column most likely containing the feedback text.

    Strategy:
      1. Substring-match column names against known feedback keywords
         (covers academic, social-media, and product-review datasets).
      2. Fallback: pick the object/string-dtype column with the longest
         average string length, sampling the first 50 rows for speed.

    Returns the matching column label, or None when the frame has no
    object/string columns at all.
    """
    # Normalized names for matching; df.columns keeps the original labels.
    col_names = [str(c).lower().strip() for c in df.columns]
    keywords = [
        'review_text', 'feedback_text', 'text', 'feedback', 'review',
        'comment', 'description', 'message', 'body', 'content', 'tweet', 'post'
    ]
    for kw in keywords:
        for i, col in enumerate(col_names):
            # Substring containment subsumes exact equality, so a single
            # `in` check replaces the original `kw == col or kw in col`.
            if kw in col:
                return df.columns[i]

    # Fallback: longest average text among string-typed columns.
    text_cols = df.select_dtypes(include=['object', 'string']).columns
    if len(text_cols) == 0:
        return None
    best_col = text_cols[0]
    best_avg_len = 0
    for col in text_cols:
        # Mean length over a 50-row sample keeps this cheap on big files.
        avg_len = df[col].astype(str).head(50).apply(len).mean()
        if avg_len > best_avg_len:
            best_avg_len = avg_len
            best_col = col
    return best_col


def process_uploaded_file_async(app, filepath, upload_record_id, user_id, selected_column=None):
    """
    Background worker that parses an uploaded CSV/Excel file and stores one
    Feedback row per non-empty text cell.

    Runs in a plain thread outside any request, so it needs the Flask ``app``
    object to push an application context. Progress is persisted on the
    Upload record (total_rows / processed_rows / status) in chunks of 50
    rows so the UI can poll it live. On any failure the Upload is marked
    'Failed' and the error message is logged.
    """
    with app.app_context():
        try:
            # --- Load the file into a DataFrame, keyed on extension ---
            if filepath.endswith('.csv'):
                df = pd.read_csv(filepath)
            elif filepath.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(filepath)
            else:
                _mark_upload_status(upload_record_id, 'Failed', "Unsupported file format")
                return

            if df.empty:
                _mark_upload_status(upload_record_id, 'Failed', "The uploaded file is empty")
                return

            # --- Determine target text column (explicit user choice wins if valid) ---
            text_col = (selected_column
                        if selected_column and selected_column in df.columns
                        else identify_text_column(df))
            if not text_col:
                _mark_upload_status(upload_record_id, 'Failed', "Could not identify a text column")
                return

            # --- Optional department/category column (first keyword hit wins) ---
            dept_col = None
            dept_keywords = [
                'department', 'category', 'product', 'dept', 'course',
                'branch', 'faculty', 'subject', 'program', 'unit'
            ]
            for col in df.columns:
                if any(kw in str(col).lower().strip() for kw in dept_keywords):
                    dept_col = col
                    break

            # Drop rows whose text cell is NA so they never reach the loop.
            df = df.dropna(subset=[text_col])
            total_rows = len(df)

            # Record the row total early so progress percentages make sense.
            upload_record = Upload.query.get(upload_record_id)
            if upload_record:
                upload_record.total_rows = total_rows
                db.session.commit()

            feedbacks = []
            for _, row in df.iterrows():
                orig_text = str(row[text_col])
                # Skip whitespace-only cells and literal 'nan' strings.
                if not orig_text.strip() or orig_text.lower() == 'nan':
                    continue

                cleaned_text = preprocess_text(orig_text)
                sentiment, score = analyze_sentiment(cleaned_text)
                department = str(row[dept_col]) if dept_col and pd.notna(row[dept_col]) else None

                feedbacks.append(Feedback(
                    user_id=user_id,
                    upload_id=upload_record_id,
                    original_text=orig_text,
                    cleaned_text=cleaned_text,
                    sentiment=sentiment,
                    sentiment_score=score,
                    department_category=department
                ))

                # Flush in chunks of 50 for live progress and bounded memory.
                if len(feedbacks) >= 50:
                    _flush_feedback_chunk(feedbacks, upload_record_id)
                    feedbacks = []

            # Save any remainder smaller than a full chunk.
            if feedbacks:
                _flush_feedback_chunk(feedbacks, upload_record_id)

            # Mark as fully completed.
            upload_record = Upload.query.get(upload_record_id)
            if upload_record:
                upload_record.status = 'Completed'
                db.session.commit()

        except Exception as e:
            # Boundary handler: this thread has no caller to propagate to,
            # so roll back and record the failure on the Upload row.
            db.session.rollback()
            _mark_upload_status(upload_record_id, 'Failed', str(e))


def _flush_feedback_chunk(feedbacks, upload_record_id):
    """Bulk-save a batch of Feedback rows and advance the Upload progress counter.

    Extracted from process_uploaded_file_async, where this block was
    duplicated verbatim (mid-loop flush and final remainder flush).
    """
    db.session.bulk_save_objects(feedbacks)
    upload_record = Upload.query.get(upload_record_id)
    if upload_record:
        # NOTE(review): assumes processed_rows defaults to 0, not None —
        # confirm the column default in models.Upload.
        upload_record.processed_rows += len(feedbacks)
    db.session.commit()


def _mark_upload_status(upload_id, status, error_msg=None):
    """Set the Upload record's status and optionally log an error message.

    BUG FIX: the original hardcoded `upload_record.status = 'Failed'`,
    silently ignoring the *status* argument — any caller passing another
    status would still have marked the upload as failed.
    """
    upload_record = Upload.query.get(upload_id)
    if upload_record:
        upload_record.status = status
        db.session.commit()
    if error_msg:
        print(f"Upload {upload_id} failed: {error_msg}")


def process_uploaded_file(filepath, upload_record_id, user_id, selected_column=None):
    """
    Entry point for the web route. Instantly spawns a background thread and
    returns so the browser request is not blocked.

    Returns a (success, message) tuple for the route to flash to the user.
    """
    # current_app is a proxy; unwrap the real app object so the worker
    # thread can push its own application context.
    app = current_app._get_current_object()
    thread = threading.Thread(
        target=process_uploaded_file_async,
        args=(app, filepath, upload_record_id, user_id, selected_column)
    )
    # NOTE(review): daemon threads are killed on interpreter exit, so an
    # in-flight upload is silently abandoned if the server stops mid-run.
    thread.daemon = True  # Allows server to shut down freely
    thread.start()
    return True, "File uploaded successfully. Neural Network is processing rows in the background!"