SuriRaja commited on
Commit
954b14a
·
verified ·
1 Parent(s): 82077f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -525
app.py CHANGED
@@ -1,582 +1,195 @@
1
- import json
2
- from typing import Any, Dict, List, Optional, Tuple
3
- from io import BytesIO
4
- import tempfile
5
-
6
- import gradio as gr
7
  import pandas as pd
8
- import matplotlib
9
- matplotlib.use("Agg")
10
  import matplotlib.pyplot as plt
11
-
12
- from transformers import AutoTokenizer, AutoModelForCausalLM
13
  from fpdf import FPDF
 
 
 
14
 
15
  # ------------------ MODEL LOADING ------------------
16
-
17
  MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
 
 
 
 
 
 
 
 
 
18
 
19
- print("Loading model... This runs once at startup.")
20
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
21
- model = AutoModelForCausalLM.from_pretrained(
22
- MODEL_NAME,
23
- device_map="auto",
24
- torch_dtype="auto"
25
- )
26
-
27
- # ------------------ LLM HELPERS ------------------
28
-
29
- def generate_llm(
30
- prompt: str,
31
- max_new_tokens: int = 512,
32
- temperature: float = 0.1
33
- ) -> str:
34
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
35
  outputs = model.generate(
36
  **inputs,
37
  max_new_tokens=max_new_tokens,
38
- do_sample=(temperature > 0),
39
- temperature=temperature,
40
- pad_token_id=tokenizer.eos_token_id
41
  )
42
- full = tokenizer.decode(outputs[0], skip_special_tokens=True)
43
- return full[len(prompt):].strip()
44
 
45
 
46
  INTENT_SYSTEM_PROMPT = """
47
- You are a Log Intelligence Planner for a pharma company.
48
-
49
- You receive a natural-language question from a user about login/access activity
50
- of scientists or employees across multiple systems and time ranges.
51
 
52
- Your job is to convert the question into a JSON object describing WHAT to do.
 
 
 
 
53
 
54
- ALLOWED actions:
55
- - "run_log_query" : Basic filtered query on logs.
56
- - "scan_anomalies" : Scan for suspicious behaviour (off-hours, many systems, failures).
57
- - "user_risk_report" : High-level risk report for one or more users.
58
- - "global_risk_report" : High-level risk report for all users.
59
-
60
- JSON SCHEMA (always follow this):
61
  {
62
- "action": "<one of the above>",
63
- "parameters": {
64
- "users": "any" OR ["Name1", "Name2"],
65
- "time_range": "all_time" OR natural text like "last_7_days", "yesterday", "this_week",
66
- "focus": "login_failures" | "off_hours" | "many_systems" | "impossible_travel" | "general",
67
- "extra": "<free text, optional>"
68
- }
69
  }
70
 
71
  RULES:
72
- - ALWAYS output ONLY valid JSON. No explanation, no markdown, no comments.
73
- - If you are unsure, choose a reasonable default:
74
- - users = "any"
75
- - time_range = "all_time"
76
- - focus = "general"
77
- - If question is not about logs at all, still output JSON with action "run_log_query"
78
- and parameters filled with "any"/"all_time"/"general".
79
  """
80
 
81
- def extract_intent(user_message: str) -> Dict[str, Any]:
82
- user_block = f'USER_QUESTION: "{user_message}"\n\nReturn ONLY the JSON object now:'
83
- prompt = INTENT_SYSTEM_PROMPT + "\n" + user_block
84
- raw = generate_llm(prompt, max_new_tokens=256, temperature=0.1)
85
-
86
- try:
87
- first = raw.find("{")
88
- last = raw.rfind("}")
89
- if first != -1 and last != -1:
90
- raw_json = raw[first:last + 1]
91
- else:
92
- raw_json = raw
93
- data = json.loads(raw_json)
94
- except Exception:
95
- data = {
96
- "action": "run_log_query",
97
- "parameters": {
98
- "users": "any",
99
- "time_range": "all_time",
100
- "focus": "general",
101
- "extra": user_message
102
- }
103
- }
104
- return data
105
-
106
-
107
  SUMMARY_SYSTEM_PROMPT = """
108
- You are a Security & Compliance Analyst for a pharma company.
109
-
110
- You receive:
111
- 1) The original user question.
112
- 2) A short description of how the logs were filtered.
113
- 3) A small sample of matching rows (already filtered from CSV).
114
- 4) A list of detected anomalies (if any).
115
-
116
- You must:
117
- - Explain findings in clear, simple language for HR / Security managers.
118
- - Highlight suspicious behaviour and why it might be risky.
119
- - Suggest 2–5 next actions (e.g., confirm travel, reset password, investigate device, etc.).
120
-
121
- FORMAT:
122
- - Start with a 1–2 line summary.
123
- - Then bullet points of key observations.
124
- - Then "Recommended actions:" with bullet points.
125
  """
126
 
127
- def generate_summary(
128
- user_question: str,
129
- filter_description: str,
130
- sample_rows: pd.DataFrame,
131
- anomalies: List[Dict[str, Any]]
132
- ) -> str:
133
- if not sample_rows.empty:
134
- sample_text = sample_rows.to_markdown(index=False)
135
- else:
136
- sample_text = "No matching rows."
137
-
138
- anomalies_text = json.dumps(anomalies, indent=2) if anomalies else "[]"
139
 
140
- prompt = SUMMARY_SYSTEM_PROMPT + "\n\n"
141
- prompt += "USER QUESTION:\n" + user_question + "\n\n"
142
- prompt += "FILTER DESCRIPTION:\n" + filter_description + "\n\n"
143
- prompt += "SAMPLE MATCHING ROWS (first few):\n" + sample_text + "\n\n"
144
- prompt += "DETECTED ANOMALIES (JSON list):\n" + anomalies_text + "\n\n"
145
- prompt += "Now write the report:\n"
146
-
147
- return generate_llm(prompt, max_new_tokens=512, temperature=0.2)
148
 
 
 
 
 
 
 
 
 
149
 
150
- # ------------------ CSV & ANOMALY ENGINE ------------------
151
 
152
- def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
153
- df = df.copy()
154
- df.columns = [c.strip().lower() for c in df.columns]
155
  return df
156
 
157
-
158
- def basic_time_filter(df: pd.DataFrame, time_range: str) -> pd.DataFrame:
159
- if "timestamp" not in df.columns:
160
- return df
161
-
162
- df = df.copy()
163
- df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
164
- df = df.dropna(subset=["timestamp"])
165
-
166
- if time_range in ["all_time", None, "unknown"]:
167
  return df
168
-
169
- now = df["timestamp"].max()
170
- if pd.isna(now):
171
- return df
172
-
173
- if time_range in ["last_7_days", "this_week"]:
174
- cutoff = now - pd.Timedelta(days=7)
175
- return df[df["timestamp"] >= cutoff]
176
- elif time_range in ["yesterday"]:
177
- start = (now - pd.Timedelta(days=1)).normalize()
178
- end = start + pd.Timedelta(days=1)
179
- return df[(df["timestamp"] >= start) & (df["timestamp"] < end)]
180
- elif time_range in ["last_30_days", "this_month"]:
181
- cutoff = now - pd.Timedelta(days=30)
182
- return df[df["timestamp"] >= cutoff]
183
- else:
184
- return df
185
-
186
-
187
- def basic_user_filter(df: pd.DataFrame, users: Any) -> pd.DataFrame:
188
- df = df.copy()
189
- user_col = None
190
- for cand in ["user", "username", "scientist", "employee"]:
191
- if cand in df.columns:
192
- user_col = cand
193
- break
194
- if user_col is None:
195
- return df
196
-
197
- if users == "any" or users is None:
198
- return df
199
-
200
  if isinstance(users, str):
201
  users = [users]
202
-
203
- users_norm = [u.strip().lower() for u in users]
204
- return df[df[user_col].astype(str).str.lower().isin(users_norm)]
205
-
206
-
207
- def detect_anomalies(
208
- df: pd.DataFrame,
209
- focus: str = "general"
210
- ) -> List[Dict[str, Any]]:
211
- anomalies: List[Dict[str, Any]] = []
212
- if df.empty:
213
- return anomalies
214
-
215
- df = df.copy()
216
- if "timestamp" in df.columns:
217
- df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
218
-
219
- # 1) Login failures
220
- if focus in ["general", "login_failures"]:
221
- fail_mask = False
222
- for col in ["status", "result", "action"]:
223
- if col in df.columns:
224
- fail_mask = fail_mask | df[col].astype(str).str.lower().str.contains("fail")
225
- failed = df[fail_mask]
226
- if not failed.empty:
227
- user_col = None
228
- for cand in ["user", "username", "scientist", "employee"]:
229
- if cand in df.columns:
230
- user_col = cand
231
- break
232
- if user_col:
233
- by_user = failed.groupby(user_col)
234
- for user, group in by_user:
235
- if len(group) >= 3:
236
- anomalies.append({
237
- "type": "login_failures",
238
- "user": str(user),
239
- "count": int(len(group)),
240
- "details": f"{len(group)} failed events found for {user}"
241
- })
242
-
243
- # 2) Off-hours (23:00–06:00)
244
- if "timestamp" in df.columns and focus in ["general", "off_hours"]:
245
- df["hour"] = df["timestamp"].dt.hour
246
- off = df[(df["hour"] >= 23) | (df["hour"] < 6)]
247
- if not off.empty:
248
- user_col = None
249
- for cand in ["user", "username", "scientist", "employee"]:
250
- if cand in df.columns:
251
- user_col = cand
252
- break
253
- if user_col:
254
- off_counts = off.groupby(user_col).size().reset_index(name="count")
255
- for _, row in off_counts.iterrows():
256
- anomalies.append({
257
- "type": "off_hours",
258
- "user": str(row[user_col]),
259
- "count": int(row["count"]),
260
- "details": f"{row['count']} off-hours events"
261
- })
262
-
263
- # 3) Many systems in a day (>= 5)
264
- if focus in ["general", "many_systems"]:
265
- user_col = None
266
- for cand in ["user", "username", "scientist", "employee"]:
267
- if cand in df.columns:
268
- user_col = cand
269
- break
270
- sys_col = None
271
- for cand in ["system", "application", "app"]:
272
- if cand in df.columns:
273
- sys_col = cand
274
- break
275
- if user_col and sys_col and "timestamp" in df.columns:
276
- df["date"] = df["timestamp"].dt.date
277
- combo = df.groupby([user_col, "date"])[sys_col].nunique().reset_index(name="system_count")
278
- many = combo[combo["system_count"] >= 5]
279
- for _, row in many.iterrows():
280
- anomalies.append({
281
- "type": "many_systems",
282
- "user": str(row[user_col]),
283
- "date": str(row["date"]),
284
- "system_count": int(row["system_count"]),
285
- "details": f"Accessed {row['system_count']} systems on {row['date']}"
286
- })
287
-
288
- # 4) Impossible travel – same user, 2 locations in same day
289
- if focus in ["general", "impossible_travel"]:
290
- user_col = None
291
- for cand in ["user", "username", "scientist", "employee"]:
292
- if cand in df.columns:
293
- user_col = cand
294
- break
295
- loc_col = None
296
- for cand in ["country", "location", "geo"]:
297
- if cand in df.columns:
298
- loc_col = cand
299
- break
300
- if user_col and loc_col and "timestamp" in df.columns:
301
- df["date"] = df["timestamp"].dt.date
302
- grouped = df.groupby([user_col, "date"])
303
- for (user, date), group in grouped:
304
- locations = group[loc_col].astype(str).str.strip().str.lower().unique()
305
- if len(locations) >= 2:
306
- anomalies.append({
307
- "type": "impossible_travel",
308
- "user": str(user),
309
- "date": str(date),
310
- "locations": list(map(str, locations)),
311
- "details": f"Multiple locations {list(locations)} in single day"
312
- })
313
-
314
  return anomalies
315
 
 
 
 
 
 
 
316
 
317
- def apply_intent_to_dataframe(
318
- df: pd.DataFrame,
319
- intent: Dict[str, Any]
320
- ) -> Tuple[pd.DataFrame, List[Dict[str, Any]], str]:
321
- df = normalize_column_names(df)
322
- action = intent.get("action", "run_log_query")
323
- params = intent.get("parameters", {})
324
- users = params.get("users", "any")
325
- time_range = params.get("time_range", "all_time")
326
- focus = params.get("focus", "general")
327
-
328
- filtered = basic_time_filter(df, time_range)
329
- filtered = basic_user_filter(filtered, users)
330
-
331
- filter_desc = f"Action: {action}, Users: {users}, Time: {time_range}, Focus: {focus}"
332
-
333
- anomalies: List[Dict[str, Any]] = []
334
- if action in ["scan_anomalies", "user_risk_report", "global_risk_report", "run_log_query"]:
335
- anomalies = detect_anomalies(filtered, focus=focus)
336
-
337
- return filtered, anomalies, filter_desc
338
 
339
-
340
- def calculate_risk_score(anomalies: List[Dict[str, Any]]):
341
- if not anomalies:
342
- return "🟢", "Low", 0
343
- count = len(anomalies)
344
- if count <= 2:
345
- return "🟡", "Medium", count
346
- return "🔴", "High", count
347
-
348
-
349
- def generate_bar_chart(df: pd.DataFrame):
350
- if df.empty or "system" not in df.columns:
351
- return None
352
- fig, ax = plt.subplots(figsize=(6, 3))
353
- data = df["system"].value_counts()
354
- ax.bar(data.index, data.values)
355
- ax.set_title("Events per System")
356
- ax.set_xlabel("System")
357
- ax.set_ylabel("Events")
358
- plt.xticks(rotation=20)
359
- fig.tight_layout()
360
- return fig
361
-
362
-
363
- def build_pdf_report(summary_text, anomalies, risk_icon, risk_label):
364
  pdf = FPDF()
365
  pdf.add_page()
366
  pdf.set_font("Arial", size=12)
367
-
368
- pdf.multi_cell(0, 10, "Security Report Smart Log Copilot", align="L")
369
- pdf.ln(2)
370
- pdf.multi_cell(0, 10, f"Risk Level: {risk_icon} {risk_label}", align="L")
371
- pdf.ln(5)
372
-
373
- pdf.set_font("Arial", size=11)
374
- pdf.multi_cell(0, 7, "Summary:", align="L")
375
- pdf.set_font("Arial", size=10)
376
- pdf.multi_cell(0, 6, summary_text)
377
- pdf.ln(5)
378
-
379
- pdf.set_font("Arial", size=11)
380
- pdf.multi_cell(0, 7, "Detected Anomalies:", align="L")
381
- pdf.set_font("Arial", size=10)
382
  if anomalies:
383
- for an in anomalies:
384
- line = f"- {an.get('type', '')}: {an.get('details', '')}"
385
- pdf.multi_cell(0, 6, line)
386
  else:
387
- pdf.multi_cell(0, 6, "No anomalies detected.")
388
-
389
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
390
  pdf.output(tmp.name)
391
  return tmp.name
392
 
393
 
394
- # ------------------ DEMO DESCRIPTION ------------------
 
 
395
 
396
- DESCRIPTION_MD = """
397
- # 🔍 Smart Log Copilot (CSV Demo)
 
 
 
 
398
 
399
- **Use case:** Pharma / corporate security teams analysing login & access logs.
400
-
401
- 1. Upload a **CSV log file** (with columns like `timestamp`, `user`, `system`, `status`, `country`, etc.)
402
- 2. Ask questions in **plain English**, e.g.:
403
- - *"Was Dr. Rao doing anything suspicious this week?"*
404
- - *"Who logged in late at night?"*
405
- - *"Who accessed too many systems in a day?"*
406
- 3. The app will:
407
- - Interpret your question via a local LLM (Qwen 1.5B)
408
- - Filter & analyse the CSV with Pandas
409
- - Run anomaly rules (off-hours, failures, many systems, impossible travel)
410
- - Return an easy-to-read summary + risk level + optional PDF report.
411
-
412
- > For demo: a **placeholder anomaly screenshot** is shown whenever anomalies are found.
413
- """
414
 
415
- PLACEHOLDER_IMAGE_URL = "https://dummyimage.com/600x300/ff0000/ffffff&text=Anomaly+Screenshot+Placeholder"
 
416
 
 
417
 
418
- # ------------------ CORE CHAT LOGIC ------------------
 
 
 
 
 
 
419
 
420
- def load_csv(file_obj):
421
- if file_obj is None:
422
- return pd.DataFrame(), pd.DataFrame(), "No file uploaded yet."
423
- try:
424
- df = pd.read_csv(file_obj.name)
425
- df = normalize_column_names(df)
426
- info = f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns."
427
- return df, df.head(20), info
428
- except Exception as e:
429
- return pd.DataFrame(), pd.DataFrame(), f"Error loading CSV: {e}"
430
-
431
-
432
- def chat_logic(user_message: str, df_state: pd.DataFrame):
433
- intent = extract_intent(user_message)
434
- filtered_df, anomalies, filter_desc = apply_intent_to_dataframe(df_state, intent)
435
-
436
- sample = filtered_df.head(30)
437
- summary = generate_summary(
438
- user_question=user_message,
439
- filter_description=filter_desc,
440
- sample_rows=sample,
441
- anomalies=anomalies
442
- )
443
-
444
- img = PLACEHOLDER_IMAGE_URL if anomalies else ""
445
- return summary, img, filtered_df, anomalies
446
-
447
-
448
- def on_user_message(user_message, chat_history, df):
449
- # Append user message
450
- chat_history = chat_history + [{"role": "user", "content": user_message}]
451
-
452
- if df is None or df.empty:
453
- reply = "📂 Please upload a CSV file with logs first."
454
- chat_history = chat_history + [{"role": "assistant", "content": reply}]
455
- return chat_history, gr.update(visible=False), gr.update(visible=False), None
456
-
457
- summary_text, img, filtered_df, anomalies = chat_logic(user_message, df)
458
-
459
- risk_icon, risk_label, _ = calculate_risk_score(anomalies)
460
- reply_text = f"{risk_icon} **Risk Level: {risk_label}**\n\n" + summary_text
461
-
462
- chat_history = chat_history + [{"role": "assistant", "content": reply_text}]
463
-
464
- # Chart
465
- fig = generate_bar_chart(filtered_df)
466
- if fig is not None:
467
- chart_update = gr.update(value=fig, visible=True)
468
- else:
469
- chart_update = gr.update(visible=False)
470
 
471
- # Report meta state
472
- report_meta = (reply_text, anomalies, risk_icon, risk_label)
 
473
 
474
- # Screenshot
475
- if img:
476
- img_update = gr.update(value=img, visible=True)
477
- else:
478
- img_update = gr.update(visible=False)
479
-
480
- return chat_history, img_update, chart_update, report_meta
481
-
482
-
483
- def on_generate_report(report_meta):
484
- if not report_meta:
485
- return gr.update(visible=False)
486
- summary_text, anomalies, risk_icon, risk_label = report_meta
487
- pdf_path = build_pdf_report(summary_text, anomalies, risk_icon, risk_label)
488
- return gr.update(value=pdf_path, visible=True)
489
-
490
-
491
- # ------------------ GRADIO UI ------------------
492
-
493
- with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", neutral_hue="gray")) as demo:
494
- gr.Markdown(DESCRIPTION_MD)
495
-
496
- with gr.Row():
497
- with gr.Column(scale=2):
498
- file_input = gr.File(label="Upload CSV log file", file_types=[".csv"])
499
- load_btn = gr.Button("Load CSV")
500
- load_info = gr.Markdown("No file loaded.")
501
- with gr.Column(scale=3):
502
- df_preview = gr.Dataframe(
503
- label="CSV Preview (first 20 rows)",
504
- interactive=False,
505
- visible=True
506
- )
507
-
508
- df_state = gr.State(pd.DataFrame())
509
-
510
- def on_load_csv(file_obj):
511
- df, preview, info = load_csv(file_obj)
512
- return df, preview, info
513
-
514
- load_btn.click(
515
- fn=on_load_csv,
516
- inputs=[file_input],
517
- outputs=[df_state, df_preview, load_info]
518
- )
519
-
520
- gr.Markdown("---")
521
- gr.Markdown("### 💬 Smart Log Copilot")
522
-
523
- with gr.Row():
524
- with gr.Column(scale=3):
525
- chatbot = gr.Chatbot(
526
- label=None,
527
- type="messages",
528
- )
529
- msg = gr.Textbox(
530
- placeholder="Ask a question like: Who logged in late at night?",
531
- show_label=False,
532
- lines=2
533
- )
534
- send_btn = gr.Button("Send", variant="primary")
535
- with gr.Column(scale=2):
536
- anomaly_image = gr.Image(
537
- label="Anomaly Screenshot (placeholder)",
538
- visible=False
539
- )
540
- chart_plot = gr.Plot(
541
- label="Log Activity Chart",
542
- visible=False
543
- )
544
- report_btn = gr.Button("Generate PDF Report", variant="secondary")
545
- pdf_file = gr.File(label="Download Security Report", visible=False)
546
-
547
- report_state = gr.State()
548
-
549
- send_btn.click(
550
- fn=on_user_message,
551
- inputs=[msg, chatbot, df_state],
552
- outputs=[chatbot, anomaly_image, chart_plot, report_state]
553
- )
554
-
555
- msg.submit(
556
- fn=on_user_message,
557
- inputs=[msg, chatbot, df_state],
558
- outputs=[chatbot, anomaly_image, chart_plot, report_state]
559
- )
560
-
561
- report_btn.click(
562
- fn=on_generate_report,
563
- inputs=[report_state],
564
- outputs=[pdf_file]
565
- )
566
-
567
- gr.Markdown(
568
- """
569
- **Tip:** Use a demo CSV with columns like:
570
- `timestamp, user, system, status, country`
571
- and deliberately add:
572
- - multiple failed logins,
573
- - some late-night logins,
574
- - same user in 2 countries on same day,
575
- - a day where a user touches 5+ systems.
576
-
577
- Then ask natural questions and let the system explain.
578
- """
579
- )
580
-
581
- if __name__ == "__main__":
582
- demo.launch()
 
1
+ import streamlit as st
 
 
 
 
 
2
  import pandas as pd
3
+ import json
 
4
  import matplotlib.pyplot as plt
 
 
5
  from fpdf import FPDF
6
+ import tempfile
7
+ from io import BytesIO
8
+ from transformers import AutoTokenizer, AutoModelForCausalLM
9
 
10
  # ------------------ MODEL LOADING ------------------
 
11
  MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
12
@st.cache_resource
def load_llm():
    """Load the Qwen tokenizer and model once per Streamlit process.

    Decorated with st.cache_resource so script reruns reuse the same
    objects instead of reloading the weights.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    mdl = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",   # place weights on an accelerator if available
        torch_dtype="auto",  # use the dtype stored in the checkpoint
    )
    return tok, mdl


tokenizer, model = load_llm()
23
+
24
+
25
def llm(prompt, max_new_tokens=400):
    """Run greedy generation and return only the newly generated text.

    Fix: the previous version decoded the whole sequence and did
    ``.replace(prompt, "")`` — that removes *every* occurrence of the
    prompt substring and silently fails whenever decoding normalizes the
    prompt text. Slicing off the prompt's token ids before decoding is
    exact.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,  # silence missing-pad warning
        do_sample=False,                      # deterministic greedy decoding
    )
    prompt_len = inputs["input_ids"].shape[1]
    generated = outputs[0][prompt_len:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()
 
34
 
35
 
36
  INTENT_SYSTEM_PROMPT = """
37
+ You convert natural-language questions into a JSON task plan for log analysis.
 
 
 
38
 
39
+ VALID actions:
40
+ - "run_log_query"
41
+ - "scan_anomalies"
42
+ - "user_risk_report"
43
+ - "global_risk_report"
44
 
45
+ OUTPUT FORMAT:
 
 
 
 
 
 
46
  {
47
+ "action": "",
48
+ "parameters": {
49
+ "users": "any" or ["name"],
50
+ "time_range": "all_time" or natural text,
51
+ "focus": "general" or "login_failures" or "off_hours" or "many_systems" or "impossible_travel",
52
+ "extra": "<free>"
53
+ }
54
  }
55
 
56
  RULES:
57
+ - ONLY output JSON.
 
 
 
 
 
 
58
  """
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  SUMMARY_SYSTEM_PROMPT = """
61
+ You write human-friendly summaries for security managers.
62
+ Explain risks clearly + list recommended actions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  """
64
 
65
+ PLACEHOLDER_IMG = "https://dummyimage.com/600x300/ff0000/ffffff&text=Anomaly+Screenshot"
 
 
 
 
 
 
 
 
 
 
 
66
 
 
 
 
 
 
 
 
 
67
 
68
def extract_intent(msg):
    """Ask the LLM to turn a user question into a JSON task plan.

    Returns the parsed plan dict, or a safe default plan when the model
    output contains no parseable JSON object.
    """
    prompt = INTENT_SYSTEM_PROMPT + "\nUSER QUESTION: " + msg + "\nReturn JSON now:"
    raw = llm(prompt)
    start, end = raw.find("{"), raw.rfind("}")
    if start != -1 and end != -1:
        try:
            return json.loads(raw[start:end + 1])
        # was a bare `except:` — that also swallowed KeyboardInterrupt/SystemExit
        except ValueError:
            pass
    return {
        "action": "run_log_query",
        "parameters": {
            "users": "any",
            "time_range": "all_time",
            "focus": "general",
            "extra": msg,
        },
    }
76
 
 
77
 
78
+ # ------------------ CSV + ANALYTICS ------------------
79
def normalize(df):
    """Return a copy of *df* with column names stripped and lower-cased.

    Fix: copies first so the caller's DataFrame is no longer mutated in
    place (the pre-refactor version of this helper also copied).
    """
    df = df.copy()
    df.columns = [c.strip().lower() for c in df.columns]
    return df
82
 
83
def basic_filter(df, users):
    """Filter log rows to the requested users, case-insensitively.

    *users* may be "any"/None (no filtering), a single name, or a list of
    names.

    Fixes: previously raised KeyError when the CSV had no "user" column,
    crashed on non-string user values, and did not handle None.
    """
    if users in ("any", None) or "user" not in df.columns:
        return df
    if isinstance(users, str):
        users = [users]
    wanted = [u.strip().lower() for u in users]
    # astype(str) guards against numeric/NaN user ids
    return df[df["user"].astype(str).str.lower().isin(wanted)]
90
+
91
def detect_anomalies(df):
    """Run the rule-based anomaly checks and return a list of findings.

    Each finding is a dict ``{"type": ..., "details": ...}``.

    Fixes: each check is skipped (instead of raising KeyError) when the
    column it needs is missing; timestamps are parsed with
    ``errors="coerce"`` so bad values become NaT instead of raising; the
    frame is copied so the caller's DataFrame is not mutated.
    """
    anomalies = []
    df = df.copy()  # the timestamp conversion below must not leak to the caller

    # 1) repeated failed logins (threshold: 3 events)
    if "status" in df.columns:
        fails = df[df["status"].astype(str).str.contains("fail", case=False, na=False)]
        if len(fails) >= 3:
            anomalies.append({"type": "login_failures",
                              "details": f"{len(fails)} failed logins found"})

    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

        # 2) off-hours activity (23:00–06:00)
        off = df[(df["timestamp"].dt.hour >= 23) | (df["timestamp"].dt.hour < 6)]
        if len(off) > 0:
            anomalies.append({"type": "off_hours",
                              "details": f"{len(off)} off-hours logins"})

        # 3) many distinct systems touched within a single day
        if "system" in df.columns:
            sys_count = df.groupby(df["timestamp"].dt.date)["system"].nunique()
            if any(sys_count >= 5):
                anomalies.append({"type": "many_systems",
                                  "details": "5+ systems accessed in a day"})

        # 4) impossible travel: multiple countries within a single day
        if "country" in df.columns:
            locations = df.groupby(df["timestamp"].dt.date)["country"].nunique()
            if any(locations >= 2):
                anomalies.append({"type": "impossible_travel",
                                  "details": "Multiple countries in one day"})

    return anomalies
112
 
113
def risk_score(anoms):
    """Map the number of detected anomalies to an (icon, label) pair."""
    count = len(anoms)
    if count == 0:
        return "🟢", "Low"
    return ("🟡", "Medium") if count <= 2 else ("🔴", "High")
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
def build_pdf(risk_icon, risk_label, summary, anomalies):
    """Render the security report to a temp PDF file and return its path.

    Fix: FPDF's built-in Arial font only supports Latin-1, so every
    dynamic string is transliterated first — otherwise the emoji risk
    icon (🟢/🟡/🔴) and the en dash in the title raise a
    UnicodeEncodeError inside pdf.output().
    """
    def _latin1(text):
        # replace characters Arial/Latin-1 cannot encode (emoji, dashes…)
        return str(text).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 8, _latin1("Security Report – Smart Log Copilot"))
    pdf.multi_cell(0, 8, _latin1(f"Risk Level: {risk_icon} {risk_label}"))
    pdf.ln(4)
    pdf.multi_cell(0, 6, _latin1(summary))
    pdf.ln(4)
    pdf.multi_cell(0, 6, "Detected Anomalies:")
    if anomalies:
        for a in anomalies:
            # .get() instead of [] so a malformed anomaly dict cannot crash
            pdf.multi_cell(0, 6, _latin1(f"- {a.get('type', '')}: {a.get('details', '')}"))
    else:
        pdf.multi_cell(0, 6, "None")
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    pdf.output(tmp.name)
    return tmp.name
139
 
140
 
141
# ------------------ STREAMLIT UI ------------------
st.set_page_config(page_title="Smart Log Copilot", layout="wide")
st.title("🔍 Smart Log Copilot (CSV-powered LLM Demo)")

uploaded = st.file_uploader("Upload CSV log file", type=["csv"])
df = None
if uploaded:
    df = normalize(pd.read_csv(uploaded))
    st.success(f"CSV loaded ({len(df)} rows)")
    st.dataframe(df.head(20))

st.markdown("---")
chat_input = st.text_input("Ask a question about the logs:")
report_slot = st.empty()

if "history" not in st.session_state:
    st.session_state.history = []

col1, col2 = st.columns([3, 2])

# Defaults so col2 never hits an unbound name if col1's branch is skipped.
anomalies, icon, label, summary = [], None, None, None

with col1:
    if chat_input and df is not None:
        intent = extract_intent(chat_input)
        params = intent.get("parameters", {})
        filtered = basic_filter(df, params.get("users", "any"))
        anomalies = detect_anomalies(filtered)
        icon, label = risk_score(anomalies)

        prompt = (SUMMARY_SYSTEM_PROMPT
                  + f"\nQUESTION: {chat_input}\nMATCHED: {len(filtered)} rows"
                  + f"\nANOMALIES: {json.dumps(anomalies)}\n\nWrite summary:")
        summary = llm(prompt)

        bot_reply = f"{icon} **Risk Level: {label}**\n\n{summary}"
        # Fix: only append when the question changed — text_input keeps its
        # value across reruns, so the old code duplicated the exchange on
        # every Streamlit rerun.
        if st.session_state.get("last_question") != chat_input:
            st.session_state.history.append(("user", chat_input))
            st.session_state.history.append(("assistant", bot_reply))
            st.session_state.last_question = chat_input

    for role, text in st.session_state.history:
        st.chat_message(role).write(text)

with col2:
    if df is not None and chat_input:
        if anomalies:
            st.image(PLACEHOLDER_IMG, caption="Anomaly Screenshot")

        if "system" in df.columns:  # chart needs a `system` column
            fig, ax = plt.subplots(figsize=(4, 2))
            df["system"].value_counts().plot(kind="bar", ax=ax)
            st.pyplot(fig)

        # Fix: a download_button nested inside a regular button never worked —
        # clicking the inner button reruns the script and the outer button
        # state resets, so the PDF could never be downloaded. Offer the
        # download directly instead.
        pdf_path = build_pdf(icon, label, summary, anomalies)
        with open(pdf_path, "rb") as f:
            st.download_button(
                "📄 Download PDF Report",
                f,
                file_name="security_report.pdf",
                mime="application/pdf",
            )