File size: 6,530 Bytes
e45ddff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c89f56c
 
 
e45ddff
c89f56c
e45ddff
c89f56c
 
 
 
 
 
 
 
 
 
e45ddff
c89f56c
 
 
e45ddff
c89f56c
 
 
 
 
e45ddff
c89f56c
 
 
 
 
 
 
 
 
 
e45ddff
c89f56c
 
 
e45ddff
c89f56c
 
 
 
 
 
 
e45ddff
c89f56c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e45ddff
c89f56c
 
 
 
 
e45ddff
c89f56c
 
 
 
 
 
 
 
e45ddff
c89f56c
 
e45ddff
c89f56c
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import os
import pandas as pd
from werkzeug.utils import secure_filename
from utils.nlp_utils import preprocess_text, analyze_sentiment
from models import db, Feedback, Upload
from flask import current_app

def allowed_file(filename):
    """Return True if *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in current_app.config['ALLOWED_EXTENSIONS']

def identify_text_column(df):
    """
    Heuristic to find the column most likely containing the feedback text.

    Strategy:
      1. Match normalized column names against common feedback/text keywords
         (substring match, which also covers exact equality). Keywords are
         ordered by specificity so 'review_text' wins before the looser 'text'.
      2. Fall back to the object/string-dtype column with the longest
         *average* string length, sampled over the first 50 rows for speed.

    Args:
        df (pd.DataFrame): The uploaded data.

    Returns:
        The original column label of the best candidate, or None when the
        frame has no object/string-typed columns at all.
    """
    # Normalize names once for case/whitespace-insensitive matching.
    col_names = [str(c).lower().strip() for c in df.columns]

    # Expanded heuristic matching for academic, social media, and product reviews.
    keywords = [
        'review_text', 'feedback_text', 'text', 'feedback', 'review',
        'comment', 'description', 'message', 'body', 'content', 'tweet', 'post'
    ]
    for kw in keywords:
        for i, col in enumerate(col_names):
            # Substring containment subsumes exact equality, so a single
            # check suffices (the old `kw == col or kw in col` was redundant).
            if kw in col:
                return df.columns[i]

    # Fallback: among string-typed columns, pick the one whose values are
    # longest on average (a proxy for free-text content).
    text_cols = df.select_dtypes(include=['object', 'string']).columns
    if len(text_cols) == 0:
        return None

    def _avg_len(col):
        # Sample only the first 50 rows so huge uploads stay fast.
        return df[col].astype(str).head(50).apply(len).mean()

    # max() keeps the first column on ties, matching the original strict-'>'
    # scan that started from text_cols[0].
    return max(text_cols, key=_avg_len)

import threading

def process_uploaded_file_async(app, filepath, upload_record_id, user_id, selected_column=None):
    """
    Background worker function that runs outside the active request thread.

    Reads the uploaded CSV/Excel file, picks the text column (user-selected
    or auto-detected), runs sentiment analysis on each row, and persists
    Feedback rows in chunks of 50 while updating the Upload record's
    progress counters. On any exception the session is rolled back and the
    Upload record is marked failed.

    Args:
        app: The concrete Flask app object; needed to push an app context
            since this code runs in a plain thread, not a request.
        filepath (str): Path to the uploaded file on disk; extension decides
            the pandas reader (.csv vs .xls/.xlsx).
        upload_record_id: Primary key of the Upload row tracking this job.
        user_id: Owner id stamped onto every created Feedback row.
        selected_column (str | None): User-chosen text column name; falls
            back to identify_text_column() when absent or not in the file.
    """
    with app.app_context():
        try:
            # Read file — dispatch on extension; anything else is rejected.
            if filepath.endswith('.csv'):
                df = pd.read_csv(filepath)
            elif filepath.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(filepath)
            else:
                _mark_upload_status(upload_record_id, 'Failed', "Unsupported file format")
                return

            if df.empty:
                _mark_upload_status(upload_record_id, 'Failed', "The uploaded file is empty")
                return

            # Determine target text column: trust the user's choice only if
            # it actually exists in the file, else auto-detect.
            text_col = selected_column if selected_column and selected_column in df.columns else identify_text_column(df)
            if not text_col:
                _mark_upload_status(upload_record_id, 'Failed', "Could not identify a text column")
                return

            # Look for a department/category column if it exists
            # (first column whose name contains any keyword wins).
            dept_col = None
            dept_keywords = [
                'department', 'category', 'product', 'dept', 'course', 
                'branch', 'faculty', 'subject', 'program', 'unit'
            ]
            for col in df.columns:
                if any(kw in str(col).lower().strip() for kw in dept_keywords):
                    dept_col = col
                    break

            # Drop NA from the text column to avoid processing empties
            df = df.dropna(subset=[text_col])
            total_rows = len(df)
            
            # Record the total row count up front so the UI can show
            # processed/total progress while chunks commit below.
            upload_record = Upload.query.get(upload_record_id)
            if upload_record:
                upload_record.total_rows = total_rows
                db.session.commit()

            feedbacks = []
            
            for index, row in df.iterrows():
                orig_text = str(row[text_col])
                
                # Skip very empty rows ('nan' catches stringified NaN that
                # survived dropna, e.g. from mixed-type columns).
                if not orig_text.strip() or orig_text.lower() == 'nan':
                    continue
                    
                cleaned_text = preprocess_text(orig_text)
                sentiment, score = analyze_sentiment(cleaned_text)
                
                # Department is optional; guard against NaN cells.
                department = str(row[dept_col]) if dept_col and pd.notna(row[dept_col]) else None
                
                feedback = Feedback(
                    user_id=user_id,
                    upload_id=upload_record_id,
                    original_text=orig_text,
                    cleaned_text=cleaned_text,
                    sentiment=sentiment,
                    sentiment_score=score,
                    department_category=department
                )
                feedbacks.append(feedback)
                
                # Commit in chunks of 50 to show live progress and conserve memory.
                # NOTE(review): the Upload row is re-fetched each chunk because the
                # commit above may have expired the earlier instance.
                # Assumes processed_rows starts at 0 on the model — TODO confirm.
                if len(feedbacks) >= 50:
                    db.session.bulk_save_objects(feedbacks)
                    upload_record = Upload.query.get(upload_record_id)
                    if upload_record:
                        upload_record.processed_rows += len(feedbacks)
                    db.session.commit()
                    feedbacks = []

            # Save remaining partial chunk (fewer than 50 rows).
            if feedbacks:
                db.session.bulk_save_objects(feedbacks)
                upload_record = Upload.query.get(upload_record_id)
                if upload_record:
                    upload_record.processed_rows += len(feedbacks)
                db.session.commit()

            # Mark as totally completed
            upload_record = Upload.query.get(upload_record_id)
            if upload_record:
                upload_record.status = 'Completed'
                db.session.commit()

        except Exception as e:
            # Roll back any half-applied chunk before recording the failure.
            db.session.rollback()
            _mark_upload_status(upload_record_id, 'Failed', str(e))
def _mark_upload_status(upload_id, status, error_msg=None):
    """
    Set an Upload record's status and commit, optionally logging a message.

    Args:
        upload_id: Primary key of the Upload row to update.
        status (str): New status value to store (e.g. 'Failed', 'Completed').
        error_msg (str | None): Optional diagnostic message to log.
    """
    upload_record = Upload.query.get(upload_id)
    if upload_record:
        # BUG FIX: previously hard-coded 'Failed' here, silently ignoring the
        # `status` argument — the function could never record any other state.
        upload_record.status = status
        db.session.commit()
    # Log outside the record check so a missing row doesn't swallow the error.
    if error_msg:
        print(f"Upload {upload_id} failed: {error_msg}")

def process_uploaded_file(filepath, upload_record_id, user_id, selected_column=None):
    """
    Entry point for the web route. Instantly spawns a background thread and returns to not block the browser.
    """
    # Grab the real app object so the worker thread can push its own context.
    flask_app = current_app._get_current_object()
    worker = threading.Thread(
        target=process_uploaded_file_async,
        args=(flask_app, filepath, upload_record_id, user_id, selected_column),
        daemon=True,  # Allows server to shut down freely
    )
    worker.start()
    return True, "File uploaded successfully. Neural Network is processing rows in the background!"