import os
import pandas as pd
from werkzeug.utils import secure_filename
from utils.nlp_utils import preprocess_text, analyze_sentiment
from models import db, Feedback, Upload
from flask import current_app
def allowed_file(filename):
    """Return True if *filename* carries an extension listed in the app's ALLOWED_EXTENSIONS config."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in current_app.config['ALLOWED_EXTENSIONS']
def identify_text_column(df):
    """
    Heuristic to find the column most likely containing the feedback text.

    First pass: match normalized column names against known feedback-related
    keywords, in priority order (more specific names first). Fallback: pick
    the string-typed column with the highest *average* string length over a
    small sample of rows.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        The original column label of the best candidate, or None if the
        DataFrame has no object/string-typed columns to fall back on.
    """
    # Normalize column names once for case/whitespace-insensitive matching.
    col_names = [str(c).lower().strip() for c in df.columns]
    # Expanded heuristic matching for academic, social media, and product reviews.
    keywords = [
        'review_text', 'feedback_text', 'text', 'feedback', 'review',
        'comment', 'description', 'message', 'body', 'content', 'tweet', 'post'
    ]
    for kw in keywords:
        for i, col in enumerate(col_names):
            # Substring containment subsumes exact equality, so one check suffices
            # (was: `kw == col or kw in col`).
            if kw in col:
                return df.columns[i]
    # Fallback: choose the object/string column whose sampled values are longest
    # on average (the original comment said "max length" but the code computes a mean).
    text_cols = df.select_dtypes(include=['object', 'string']).columns
    if len(text_cols) == 0:
        return None
    best_col = text_cols[0]
    best_avg_len = 0
    for col in text_cols:
        # Sample only the first 50 rows for speed on large uploads.
        avg_len = df[col].astype(str).head(50).map(len).mean()
        if avg_len > best_avg_len:
            best_avg_len = avg_len
            best_col = col
    return best_col
import threading
def process_uploaded_file_async(app, filepath, upload_record_id, user_id, selected_column=None):
    """
    Background worker function that runs outside the active request thread.

    Reads the uploaded CSV/Excel file, locates the feedback-text column,
    runs preprocessing and sentiment analysis on each row, and persists
    Feedback rows in chunks of 50 while updating the Upload record's
    progress counters. On any failure the Upload record is marked failed
    via _mark_upload_status().

    Args:
        app: The Flask application object — needed to push an app context,
            since this runs in a plain thread with no request context.
        filepath: Path on disk to the uploaded .csv/.xls/.xlsx file.
        upload_record_id: Primary key of the Upload row tracking this job.
        user_id: Owner to stamp on every created Feedback row.
        selected_column: Optional explicit text-column name; used only when
            it actually exists in the file, else the heuristic decides.
    """
    with app.app_context():
        try:
            # Read file — format is inferred from the filename extension.
            if filepath.endswith('.csv'):
                df = pd.read_csv(filepath)
            elif filepath.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(filepath)
            else:
                _mark_upload_status(upload_record_id, 'Failed', "Unsupported file format")
                return
            if df.empty:
                _mark_upload_status(upload_record_id, 'Failed', "The uploaded file is empty")
                return
            # Determine target text column: honor the caller's explicit choice
            # only if that column actually exists; otherwise fall back to the heuristic.
            text_col = selected_column if selected_column and selected_column in df.columns else identify_text_column(df)
            if not text_col:
                _mark_upload_status(upload_record_id, 'Failed', "Could not identify a text column")
                return
            # Look for a department/category column if it exists (first match wins).
            dept_col = None
            dept_keywords = [
                'department', 'category', 'product', 'dept', 'course',
                'branch', 'faculty', 'subject', 'program', 'unit'
            ]
            for col in df.columns:
                if any(kw in str(col).lower().strip() for kw in dept_keywords):
                    dept_col = col
                    break
            # Drop NA from the text column to avoid processing empties.
            df = df.dropna(subset=[text_col])
            total_rows = len(df)
            # Record the row total up front so progress can be reported as a fraction.
            upload_record = Upload.query.get(upload_record_id)
            if upload_record:
                upload_record.total_rows = total_rows
                db.session.commit()
            feedbacks = []
            for index, row in df.iterrows():
                orig_text = str(row[text_col])
                # Skip very empty rows; 'nan' guards against stringified NaN values.
                if not orig_text.strip() or orig_text.lower() == 'nan':
                    continue
                cleaned_text = preprocess_text(orig_text)
                sentiment, score = analyze_sentiment(cleaned_text)
                department = str(row[dept_col]) if dept_col and pd.notna(row[dept_col]) else None
                feedback = Feedback(
                    user_id=user_id,
                    upload_id=upload_record_id,
                    original_text=orig_text,
                    cleaned_text=cleaned_text,
                    sentiment=sentiment,
                    sentiment_score=score,
                    department_category=department
                )
                feedbacks.append(feedback)
                # Commit in chunks of 50 to show live progress and conserve memory.
                if len(feedbacks) >= 50:
                    db.session.bulk_save_objects(feedbacks)
                    # Re-fetch each chunk: session state may have expired after the prior commit.
                    upload_record = Upload.query.get(upload_record_id)
                    if upload_record:
                        upload_record.processed_rows += len(feedbacks)
                    db.session.commit()
                    feedbacks = []
            # Save the final partial chunk (fewer than 50 rows).
            if feedbacks:
                db.session.bulk_save_objects(feedbacks)
                upload_record = Upload.query.get(upload_record_id)
                if upload_record:
                    upload_record.processed_rows += len(feedbacks)
                db.session.commit()
            # Mark as totally completed.
            upload_record = Upload.query.get(upload_record_id)
            if upload_record:
                upload_record.status = 'Completed'
                db.session.commit()
        except Exception as e:
            # Roll back any half-applied chunk before recording the failure.
            db.session.rollback()
            _mark_upload_status(upload_record_id, 'Failed', str(e))
def _mark_upload_status(upload_id, status, error_msg=None):
    """
    Set the status of an Upload record and commit the change.

    Args:
        upload_id: Primary key of the Upload row to update.
        status: New status string to store (e.g. 'Failed', 'Completed').
        error_msg: Optional error detail; printed for diagnostics when given.
    """
    upload_record = Upload.query.get(upload_id)
    if upload_record:
        # Bug fix: previously hardcoded 'Failed' and ignored the status argument,
        # so any non-failure status passed here was silently recorded as 'Failed'.
        upload_record.status = status
        db.session.commit()
    if error_msg:
        print(f"Upload {upload_id} failed: {error_msg}")
def process_uploaded_file(filepath, upload_record_id, user_id, selected_column=None):
    """
    Kick off background processing of an uploaded file without blocking the request.

    Captures the real Flask app object (the worker thread has no request
    context of its own), hands all arguments to the async worker on a
    daemon thread, and returns immediately with a (success, message) pair.
    """
    flask_app = current_app._get_current_object()
    worker = threading.Thread(
        target=process_uploaded_file_async,
        args=(flask_app, filepath, upload_record_id, user_id, selected_column),
        daemon=True,  # daemon thread lets the server shut down freely
    )
    worker.start()
    return True, "File uploaded successfully. Neural Network is processing rows in the background!"
|