File size: 16,058 Bytes
1ac9f32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
import os
import re
import shutil
import pandas as pd
import tempfile
from flask import Flask, render_template, request, jsonify, session, redirect, url_for
from werkzeug.utils import secure_filename
import traceback
import secrets
from concurrent.futures import ThreadPoolExecutor, as_completed

# Local Modules
from core.parsers import process_file
from core.analytics import run_analytics_pipeline
from core.llm_service import generate_report


from collections import defaultdict
import time

# Per-IP request timestamps (seconds since the epoch) for the sliding-window
# rate limiter below. Lives in process memory; keys are never removed here.
request_counts = defaultdict(list)


def is_rate_limited(ip: str, limit: int = 10, window: int = 60) -> bool:
    """Sliding-window rate limiter keyed by client identifier.

    Args:
        ip: Key identifying the caller (the caller passes an IP string).
        limit: Maximum number of accepted requests inside one window.
        window: Window length in seconds.

    Returns:
        True when the caller already has ``limit`` accepted requests inside
        the last ``window`` seconds (the current request is NOT recorded),
        otherwise False (the current request IS recorded).
    """
    current = time.time()
    # Keep only the timestamps that still fall inside the window.
    recent = [stamp for stamp in request_counts[ip] if current - stamp < window]
    request_counts[ip] = recent
    if len(recent) < limit:
        recent.append(current)
        return False
    return True

# Flask application object; every route and hook below is registered on it.
app = Flask(__name__)
# 🛡️ Sentinel: Secure secret key using environment variable with a robust random fallback
# NOTE: the fallback is generated at import time, so without FLASK_SECRET_KEY
# the key changes on every restart (existing session cookies stop validating),
# and each process that imports this module generates its own key.
app.secret_key = os.environ.get('FLASK_SECRET_KEY', os.urandom(32).hex())

app.config.update(
    # Directory under the system temp dir; created at import time further down.
    UPLOAD_FOLDER=os.path.join(tempfile.gettempdir(), 'the_algorithm_uploads'),
    # Request-body cap for the whole request (all uploaded files together).
    MAX_CONTENT_LENGTH=100 * 1024 * 1024, # 100 mb limit
    # SameSite=None together with Secure — presumably so the session cookie is
    # still sent when the app is embedded in a third-party iframe (the CSP
    # frame-ancestors list allows such embedding); confirm before changing.
    SESSION_COOKIE_SAMESITE='None',
    SESSION_COOKIE_SECURE=True,
    SESSION_COOKIE_HTTPONLY=True  # 🛡️ Sentinel: Prevent XSS session hijacking
)

# Simple server-side store to bypass 4KB session cookie limit
# Maps the token stored in session['data_id'] to a dict with the keys
# 'stats', 'report', 'messages' and 'connection_type' (written by /process).
# It is a plain in-process dict, so its contents are lost on restart.
GLOBAL_DATA_STORE = {}

# Case-insensitive matcher for chat-export system/placeholder lines that must
# never be surfaced as highlights (Bolt Optimization V5.4: compiled once at
# import time instead of per request).
_SYSTEM_PHRASES = (
    r'missed voice call',
    r'missed video call',
    r'end-to-end encrypted',
    r'tap for more info',
    r'message was deleted',
    r'deleted this message',
    r'image omitted',
    r'video omitted',
    r'audio omitted',
    r'sticker omitted',
    r'gif omitted',
    r'contact card omitted',
)
SYSTEM_PHRASES_RE = re.compile('|'.join(_SYSTEM_PHRASES), re.IGNORECASE)

# 🛡️ Sentinel: Strict file extension allowlist (extensions without the dot)
ALLOWED_EXTENSIONS = {'txt', 'html', 'json', 'pdf'}


MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB per uploaded file
# Dotted, lower-case extensions accepted by validate_upload().
ALLOWED_EXTENSIONS_SET = {'.txt', '.html', '.json'}


def validate_upload(file) -> tuple[bool, str]:
    """Pre-check an uploaded file object before its content is read.

    Args:
        file: Object exposing ``content_length`` and ``filename`` attributes
            (the caller passes items from ``request.files``).

    Returns:
        ``(True, "")`` when the file passes both checks, otherwise
        ``(False, message)`` with a short user-facing reason.
    """
    # The declared length may be missing or zero, in which case this check is
    # skipped; the caller re-checks the real byte count after reading.
    if file.content_length and file.content_length > MAX_FILE_SIZE:
        return False, "File too large (max 10MB)"

    # A missing filename (None) is treated like an empty one instead of
    # crashing in os.path.splitext with a TypeError.
    filename = file.filename or ""
    ext = os.path.splitext(filename)[1].lower()
    if ext not in ALLOWED_EXTENSIONS_SET:
        return False, "File type not supported"

    return True, ""

def allowed_file(filename):
    """Return True when ``filename`` has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively;
    a name without any dot is rejected.
    """
    _, dot, suffix = filename.rpartition('.')
    return bool(dot) and suffix.lower() in ALLOWED_EXTENSIONS

# Ensure upload directory exists
# Runs at import time; exist_ok=True makes repeated imports/restarts harmless.
# NOTE(review): /process keeps uploads in memory and does not write into this
# folder in the code visible here — confirm whether it is still needed.
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

# ── Security Headers ──
@app.after_request
def add_security_headers(response):
    """Attach hardening headers to every outgoing response.

    Always sets X-Content-Type-Options, Referrer-Policy,
    Strict-Transport-Security and Content-Security-Policy. For the routes that
    serve or mutate per-session data, caching is additionally disabled.

    Args:
        response: The response object about to be sent.

    Returns:
        The same response object with the headers applied.
    """
    # CSP directives; joined with "; " and terminated with ";" below.
    # frame-ancestors allows embedding in HF while still providing
    # clickjacking protection against other origins.
    csp_directives = (
        "default-src 'self'",
        "script-src 'self' 'unsafe-inline' 'unsafe-eval' https://cdn.tailwindcss.com https://cdn.jsdelivr.net https://html2canvas.hertzen.com",
        "style-src 'self' 'unsafe-inline' cdn.tailwindcss.com fonts.googleapis.com",
        "font-src 'self' fonts.gstatic.com",
        "img-src 'self' data: blob:",
        "connect-src 'self' https://*.lit.ai",
        "object-src 'none'",
        "frame-ancestors 'self' https://*.huggingface.co https://huggingface.co https://*.pages.dev https://*.workers.dev",
    )

    baseline_headers = (
        # Prevent MIME sniffing
        ('X-Content-Type-Options', 'nosniff'),
        # Control referrer information
        ('Referrer-Policy', 'strict-origin-when-cross-origin'),
        # Strict Transport Security
        ('Strict-Transport-Security', 'max-age=31536000; includeSubDomains; preload'),
        ('Content-Security-Policy', "; ".join(csp_directives) + ";"),
    )
    for header_name, header_value in baseline_headers:
        response.headers[header_name] = header_value

    # Responses for these paths carry or change session-bound data, so they
    # must never be stored by browsers or intermediaries.
    uncached_paths = ('/process', '/dashboard', '/flashback', '/highlights', '/clear')
    if request.path in uncached_paths:
        no_cache_headers = (
            ('Cache-Control', 'no-store, no-cache, must-revalidate, max-age=0'),
            ('Pragma', 'no-cache'),
            ('Expires', '0'),
        )
        for header_name, header_value in no_cache_headers:
            response.headers[header_name] = header_value

    return response


@app.errorhandler(Exception)
def handle_error(e):
    """Application-wide fallback error handler.

    A handler registered for ``Exception`` also receives HTTP errors raised by
    the framework (404 for unknown URLs, 405 for wrong methods, 413 when the
    request body exceeds MAX_CONTENT_LENGTH, ...). Those are passed through
    with their real status code instead of being reported as a 500.

    Args:
        e: The exception raised while handling the request.

    Returns:
        A ``(json_response, status_code)`` tuple with an ``error`` field.
    """
    # Imported locally so this block does not depend on the file's import list.
    from werkzeug.exceptions import HTTPException

    if isinstance(e, HTTPException):
        # Expected HTTP error: keep its status code and standard description.
        return jsonify({"error": e.description}), e.code or 500

    # Log the full error internally
    app.logger.error("Unhandled error: %s", e, exc_info=True)
    # Return generic message to client
    return jsonify({"error": "An error occurred. Please try again."}), 500

@app.route('/')
def index():
    """Render the landing page template (index.html)."""
    return render_template('index.html')

@app.route('/clear')
def clear_session():
    """Drop the caller's stored analysis and reset their session.

    Removes the entry referenced by ``session['data_id']`` from
    GLOBAL_DATA_STORE (if there is one), clears the session, and redirects to
    the landing page.
    """
    stored_key = session.get('data_id')
    if stored_key:
        # pop() with a default is a no-op when the entry was already evicted.
        GLOBAL_DATA_STORE.pop(stored_key, None)
    session.clear()
    return redirect(url_for('index'))

@app.route('/instructions')
def instructions():
    """Render the instructions page template (instructions.html)."""
    return render_template('instructions.html')

@app.route('/privacy')
def privacy():
    """Render the privacy page template (privacy.html)."""
    return render_template('privacy.html')

@app.route('/process', methods=['POST'])
def process_chat():
    """Handle a chat upload: validate, parse, analyse, and store the result.

    Expects a multipart POST with one or more ``chat_files`` plus the form
    fields ``my_name``, ``partner_name`` and the optional ``connection_type``,
    ``output_language``, ``user_context``, ``api_key``, ``hf_url`` and
    ``llm_provider``.

    Steps: rate-limit check, input validation/normalisation, reading the
    uploaded files into memory, concurrent parsing via ``process_file``,
    analytics via ``run_analytics_pipeline``, report generation via
    ``generate_report``, and finally storing the outcome in GLOBAL_DATA_STORE
    under a fresh token saved in ``session['data_id']``.

    Returns:
        A JSON response: 200 with a ``message`` on success; 400 with an
        ``error`` for invalid input or unusable data; 429 when rate limited;
        500 with a generic ``error`` when an unexpected exception occurs.
    """
    # NOTE(review): X-Forwarded-For is a client-supplied header unless a
    # trusted proxy overwrites it, so this key can be varied by the caller to
    # sidestep the limiter — confirm the deployment's proxy behaviour.
    client_ip = request.headers.get('X-Forwarded-For', request.remote_addr)
    if is_rate_limited(client_ip, limit=5, window=60):
        return jsonify({'error': 'Rate limit exceeded. Please try again later.'}), 429

    if 'chat_files' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    files = request.files.getlist('chat_files')
    # 🛡️ Sentinel: Limit file count to prevent DoS
    if len(files) > 20:
        return jsonify({'error': 'Too many files. Maximum 20 allowed.'}), 400

    # 🛡️ Sentinel: Enforce length limits on names to prevent resource exhaustion/injection bloat
    my_name = request.form.get('my_name', '').strip()[:100]
    partner_name = request.form.get('partner_name', '').strip()[:100]

    # 🛡️ Sentinel: Strict allowlists for critical parameters
    connection_type = request.form.get('connection_type', 'romantic').strip()
    if connection_type not in ['romantic', 'friendship', 'professional', 'family', 'casual']:
        connection_type = 'romantic'

    output_language = request.form.get('output_language', 'english').strip()
    if output_language not in ['english', 'hinglish', 'hindi']:
        output_language = 'english'

    # 🛡️ Sentinel: Truncate user_context to 2,000 chars to prevent DoS via massive payload
    user_context = request.form.get('user_context', '').strip()[:2000]
    # 🛡️ Sentinel: Truncate api_key and hf_url to prevent resource abuse
    api_key = request.form.get('api_key', '').strip()[:512]
    hf_url = request.form.get('hf_url', '').strip()[:512]

    provider = request.form.get('llm_provider', 'openai').strip()
    if provider not in ['openai', 'anthropic', 'gemini', 'grok', 'xai']:
        provider = 'openai'

    if not my_name or not partner_name:
         return jsonify({'error': 'Both names are required'}), 400

    # (original filename, raw bytes) for every accepted upload
    saved_files = []

    # 🛡️ Sentinel: Use TemporaryDirectory for per-request isolation and automatic cleanup
    # NOTE(review): nothing in this function writes into upload_dir; the
    # uploads are held in memory as bytes.
    with tempfile.TemporaryDirectory() as upload_dir:
        try:
            # 1. Read the uploaded files into memory
            for file in files:
                if file and file.filename:
                    is_valid, err_msg = validate_upload(file)
                    if not is_valid:
                        return jsonify({'error': f"{err_msg}: {file.filename}"}), 400

                    file_bytes = file.read()
                    # Authoritative size check on the bytes actually received.
                    if len(file_bytes) > MAX_FILE_SIZE:
                        return jsonify({'error': f"File too large: {file.filename}"}), 400
                    saved_files.append((file.filename, file_bytes))

            if not saved_files:
                 return jsonify({'error': 'No valid files uploaded'}), 400

            # 2. Parse Files Concurrently
            dfs = []
            parsing_errors = []
            # 🛡️ Sentinel: Cap max workers to prevent CPU starvation in smaller environments
            with ThreadPoolExecutor(max_workers=min(8, len(saved_files) + 4)) as executor:
                # Submit all parsing tasks
                future_to_filepath = {executor.submit(process_file, fn, fb, my_name, partner_name): fn for fn, fb in saved_files}

                for future in as_completed(future_to_filepath):
                    try:
                        df = future.result()
                        if not df.empty:
                            dfs.append(df)
                    except Exception as exc:
                        # One bad file must not abort the others; the first
                        # recorded message is only used if nothing parsed.
                        parsing_errors.append(str(exc))


            if not dfs:
                if parsing_errors:
                    # Map the internal error text onto a fixed set of safe,
                    # user-facing messages instead of echoing it back.
                    err_str = str(parsing_errors[0])
                    safe_err = "A file format error or name mismatch occurred."
                    if "Name Mismatch" in err_str:
                        safe_err = "Name Mismatch: The provided names do not match the chat data."
                    elif "format" in err_str.lower():
                        safe_err = "Unsupported file format."
                    return jsonify({'error': safe_err}), 400
                return jsonify({'error': 'Could not extract any valid messages from the provided files.'}), 400

            full_df = pd.concat(dfs, ignore_index=True)

            # 🛡️ Sentinel: Enforce message limit to prevent memory exhaustion (DoS)
            if len(full_df) > 50000:
                return jsonify({'error': 'Too many messages. Maximum 50,000 allowed for analysis.'}), 400

            full_df.sort_values('timestamp', inplace=True)

            # 3. Analytics & Privacy Drop
            analytics_result = run_analytics_pipeline(full_df, hf_url=hf_url, connection_type=connection_type)

            if not analytics_result.get('weekly'):
                return jsonify({'error': 'Not enough data to form weekly statistics.'}), 400

            # 4. LLM Generation - Pass the entire analytics payload, not just weekly stats
            report = generate_report(provider, api_key, analytics_result, my_name, partner_name, connection_type, user_context, output_language)

            # 5. Store in Global Data Store (Session cookies are limited to 4KB)
            # 🛡️ Sentinel: Replace uuid with cryptographically secure token
            session_id = secrets.token_urlsafe(16)

            # 🛡️ Sentinel: Implement FIFO eviction to prevent memory exhaustion (DoS)
            # dicts iterate in insertion order, so the first key is the oldest.
            if len(GLOBAL_DATA_STORE) >= 100:
                oldest_session = next(iter(GLOBAL_DATA_STORE))
                del GLOBAL_DATA_STORE[oldest_session]

            # Store df for flashbacks (privacy: only for duration of session)
            # Performance Optimization: Store as DataFrame to avoid slow to_dict() and
            # enable vectorized filtering in /flashback and /highlights.
            flashback_df = full_df[['timestamp', 'sender', 'text']].copy()

            GLOBAL_DATA_STORE[session_id] = {
                'stats': analytics_result,
                'report': report,
                'messages': flashback_df,
                'connection_type': connection_type
            }
            session['data_id'] = session_id

            return jsonify({'message': 'Processing completed successfully'})

        except Exception:
            # Record the failure with its traceback server-side; previously it
            # was dropped silently, leaving 500s impossible to diagnose. The
            # client still only sees a generic message.
            app.logger.exception("Unhandled error while processing chat upload")
            return jsonify({'error': 'An internal server error occurred while processing your request.'}), 500
        finally:
            # 🛡️ Sentinel: Explicitly free memory by deleting references to large DataFrames
            if 'dfs' in locals(): del dfs
            if 'full_df' in locals(): del full_df
            if 'flashback_df' in locals(): del flashback_df

@app.route('/dashboard')
def dashboard():
    """Render the dashboard for the caller's stored analysis.

    Falls back to rendering the landing page when the session holds no
    ``data_id`` or the referenced entry is no longer in GLOBAL_DATA_STORE.
    """
    stored_key = session.get('data_id')
    record = GLOBAL_DATA_STORE.get(stored_key) if stored_key else None
    if record is None:
        # Nothing to show: send the user back to the start page.
        return render_template('index.html')

    return render_template(
        'dashboard.html',
        stats=record['stats'],
        report=record['report'],
    )

@app.route('/flashback')
def get_flashback():
    """Return up to 8 stored messages from the week starting at ``?week=``.

    Query parameters:
        week: Start of the week, in any form ``pandas.to_datetime`` accepts.

    Returns:
        A JSON list of message records (``timestamp`` formatted as
        ``YYYY-MM-DD HH:MM:SS``) taken from the first rows, in stored order,
        whose timestamp falls inside the 7-day window. An empty list is
        returned when there is no stored data, no ``week`` parameter, or the
        lookup fails.
    """
    data_id = session.get('data_id')
    week_start = request.args.get('week')
    if not data_id or not week_start or data_id not in GLOBAL_DATA_STORE:
        return jsonify([])

    # 🛡️ Sentinel: Truncate week_start to prevent potential ReDoS or parsing issues
    week_start = week_start.strip()[:50]

    df = GLOBAL_DATA_STORE[data_id]['messages']

    # Filter messages for that week
    try:
        ws_dt = pd.to_datetime(week_start)
        we_dt = ws_dt + pd.Timedelta(days=7)

        # Performance Optimization: vectorized Pandas filtering over the
        # half-open interval [week start, week start + 7 days).
        mask = (df['timestamp'] >= ws_dt) & (df['timestamp'] < we_dt)
        # Only the first 8 matching rows are returned, so only those are
        # copied and formatted.
        messages_in_week = df[mask].head(8).copy()
        messages_in_week['timestamp'] = messages_in_week['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

        return jsonify(messages_in_week.to_dict(orient='records'))
    except Exception:
        # Best-effort endpoint: report the problem through the application
        # logger (with traceback) and degrade to an empty result.
        app.logger.exception("Flashback error")
        return jsonify([])

@app.route('/highlights')
def get_highlights():
    """Return up to 5 randomly chosen message highlights for the session.

    Candidate messages are 16-149 characters long, do not start with
    ``<Media``, contain no ``http`` and match none of the system phrases in
    SYSTEM_PHRASES_RE. Each highlight gets a random title drawn from a list
    that depends on the stored connection type.

    Returns:
        JSON ``{'highlights': [...], 'connection_type': ...}``; when there is
        no stored data or no candidate message, ``{'highlights': []}``.
    """
    data_id = session.get('data_id')
    if not data_id or data_id not in GLOBAL_DATA_STORE:
        return jsonify({'highlights': []})

    df = GLOBAL_DATA_STORE[data_id].get('messages')
    connection_type = GLOBAL_DATA_STORE[data_id].get('connection_type', 'romantic')

    if df is None or df.empty:
        return jsonify({'highlights': []})

    # Filter messages that are reasonably substantial, not just media/links, and not tiny reactions
    # Performance Optimization: Use vectorized Pandas string operations for filtering
    # and pre-compiled SYSTEM_PHRASES_RE.
    t_series = df['text'].astype(str)
    text_lengths = t_series.str.len()

    # Check for system phrases using vectorized regex search (V5.4 Bolt Optimization)
    is_sys_msg = t_series.str.contains(SYSTEM_PHRASES_RE, na=False)

    # Combined filter: length 16-149, no media tags, no links, not system message
    mask = (
        (text_lengths > 15) &
        (text_lengths < 150) &
        (~t_series.str.startswith('<Media', na=False)) &
        (~t_series.str.contains('http', na=False)) &
        (~is_sys_msg)
    )

    valid_df = df[mask]

    if valid_df.empty:
        return jsonify({'highlights': []})

    # 🛡️ Sentinel: Use cryptographically secure random for highlight selection (Bandit B311)
    secure_random = secrets.SystemRandom()

    # Sample up to 5 highlights
    sample_size = min(5, len(valid_df))
    sampled_indices = secure_random.sample(range(len(valid_df)), sample_size)
    sampled_df = valid_df.iloc[sampled_indices].copy()

    # Performance Optimization: Format timestamp to string once for the sample
    sampled_df['timestamp'] = sampled_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # Convert to list of dicts for processing
    sampled = sampled_df.to_dict(orient='records')

    # Contextual titles based on connection type, chosen once per request.
    # /process stores 'friendship' (not 'friend'), so both spellings are
    # accepted; matching only 'friend' left these titles unreachable.
    if connection_type == 'romantic':
        titles = ["A Sweet Moment", "Looking Back", "A Spark", "Connection Highlight"]
    elif connection_type in ('friend', 'friendship'):
        titles = ["A Fun Memory", "Vibes", "Remember This?", "Friendship Highlight"]
    elif connection_type == 'professional':
        titles = ["Collaboration Note", "Discussion Point", "Key Exchange"]
    else:
        titles = ["A Memory"]

    highlights = []
    for msg in sampled:
        # 'ME' marks the uploader's own messages; anything else is the partner.
        sender_label = "You" if msg.get('sender') == 'ME' else "Partner"

        highlights.append({
            'title': secure_random.choice(titles),
            'sender': sender_label,
            'text': msg.get('text', ''),
            'timestamp': msg.get('timestamp', '')
        })

    return jsonify({'highlights': highlights, 'connection_type': connection_type})


if __name__ == '__main__':
    # Debug mode is opt-in: FLASK_DEBUG must be "true", "1" or "t"
    # (case-insensitive); anything else, or an unset variable, disables it.
    debug_flag = os.environ.get('FLASK_DEBUG', 'False').lower()
    app.run(debug=debug_flag in ('true', '1', 't'), port=5000)