SuriRaja committed on
Commit
f6db94e
·
verified ·
1 Parent(s): 15e521f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +174 -100
app.py CHANGED
@@ -1,11 +1,18 @@
1
  import json
2
- import textwrap
3
  from typing import Any, Dict, List, Optional, Tuple
 
 
4
 
5
  import gradio as gr
6
  import pandas as pd
 
 
 
 
7
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
8
 
 
9
 
10
  MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
11
 
@@ -17,15 +24,13 @@ model = AutoModelForCausalLM.from_pretrained(
17
  torch_dtype="auto"
18
  )
19
 
20
-
21
- # ---------- LLM HELPERS ----------
22
 
23
  def generate_llm(
24
  prompt: str,
25
  max_new_tokens: int = 512,
26
  temperature: float = 0.1
27
  ) -> str:
28
- """Simple text generation helper."""
29
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
30
  outputs = model.generate(
31
  **inputs,
@@ -35,7 +40,6 @@ def generate_llm(
35
  pad_token_id=tokenizer.eos_token_id
36
  )
37
  full = tokenizer.decode(outputs[0], skip_special_tokens=True)
38
- # Return only the new text after the prompt
39
  return full[len(prompt):].strip()
40
 
41
 
@@ -74,14 +78,11 @@ RULES:
74
  and parameters filled with "any"/"all_time"/"general".
75
  """
76
 
77
-
78
  def extract_intent(user_message: str) -> Dict[str, Any]:
79
- """Call LLM to convert user message → intent JSON."""
80
  user_block = f'USER_QUESTION: "{user_message}"\n\nReturn ONLY the JSON object now:'
81
  prompt = INTENT_SYSTEM_PROMPT + "\n" + user_block
82
  raw = generate_llm(prompt, max_new_tokens=256, temperature=0.1)
83
 
84
- # Try to extract JSON from model output
85
  try:
86
  first = raw.find("{")
87
  last = raw.rfind("}")
@@ -91,7 +92,6 @@ def extract_intent(user_message: str) -> Dict[str, Any]:
91
  raw_json = raw
92
  data = json.loads(raw_json)
93
  except Exception:
94
- # Fallback safe default
95
  data = {
96
  "action": "run_log_query",
97
  "parameters": {
@@ -130,8 +130,11 @@ def generate_summary(
130
  sample_rows: pd.DataFrame,
131
  anomalies: List[Dict[str, Any]]
132
  ) -> str:
133
- # Convert sample rows & anomalies to compact text
134
- sample_text = sample_rows.to_markdown(index=False) if not sample_rows.empty else "No matching rows."
 
 
 
135
  anomalies_text = json.dumps(anomalies, indent=2) if anomalies else "[]"
136
 
137
  prompt = SUMMARY_SYSTEM_PROMPT + "\n\n"
@@ -144,7 +147,7 @@ def generate_summary(
144
  return generate_llm(prompt, max_new_tokens=512, temperature=0.2)
145
 
146
 
147
- # ---------- CSV FILTER & ANOMALY ENGINE ----------
148
 
149
  def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
150
  df = df.copy()
@@ -153,10 +156,6 @@ def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
153
 
154
 
155
  def basic_time_filter(df: pd.DataFrame, time_range: str) -> pd.DataFrame:
156
- """
157
- Expect a 'timestamp' column in a parseable datetime format.
158
- For demo, support a few simple ranges; otherwise return df.
159
- """
160
  if "timestamp" not in df.columns:
161
  return df
162
 
@@ -182,14 +181,10 @@ def basic_time_filter(df: pd.DataFrame, time_range: str) -> pd.DataFrame:
182
  cutoff = now - pd.Timedelta(days=30)
183
  return df[df["timestamp"] >= cutoff]
184
  else:
185
- # Unknown text → just return df for MVP
186
  return df
187
 
188
 
189
  def basic_user_filter(df: pd.DataFrame, users: Any) -> pd.DataFrame:
190
- """
191
- Expect 'user' or 'username' or 'scientist' column.
192
- """
193
  df = df.copy()
194
  user_col = None
195
  for cand in ["user", "username", "scientist", "employee"]:
@@ -206,44 +201,38 @@ def basic_user_filter(df: pd.DataFrame, users: Any) -> pd.DataFrame:
206
  users = [users]
207
 
208
  users_norm = [u.strip().lower() for u in users]
209
- return df[df[user_col].str.lower().isin(users_norm)]
210
 
211
 
212
  def detect_anomalies(
213
  df: pd.DataFrame,
214
  focus: str = "general"
215
  ) -> List[Dict[str, Any]]:
216
- """
217
- Very simple rule-based anomaly engine for demo.
218
- Expectations:
219
- - 'timestamp' datetime column
220
- - 'status' or 'result' for failures
221
- - 'system' or 'application' column
222
- - 'country' or 'location' for impossible travel (demo-level)
223
- """
224
  anomalies: List[Dict[str, Any]] = []
225
  if df.empty:
226
  return anomalies
227
 
228
- # Ensure needed columns exist
229
  df = df.copy()
230
  if "timestamp" in df.columns:
231
  df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
232
 
233
  # 1) Login failures
234
  if focus in ["general", "login_failures"]:
235
- # interpret failed rows
236
  fail_mask = False
237
  for col in ["status", "result", "action"]:
238
  if col in df.columns:
239
  fail_mask = fail_mask | df[col].astype(str).str.lower().str.contains("fail")
240
  failed = df[fail_mask]
241
  if not failed.empty:
242
- by_user = failed.groupby(df.columns[df.columns.str.contains("user|scientist|employee")][0]) \
243
- if df.columns.str.contains("user|scientist|employee").any() else None
244
- if by_user is not None:
 
 
 
 
245
  for user, group in by_user:
246
- if len(group) >= 3: # threshold
247
  anomalies.append({
248
  "type": "login_failures",
249
  "user": str(user),
@@ -251,7 +240,7 @@ def detect_anomalies(
251
  "details": f"{len(group)} failed events found for {user}"
252
  })
253
 
254
- # 2) Off-hours access (after 23:00 or before 06:00)
255
  if "timestamp" in df.columns and focus in ["general", "off_hours"]:
256
  df["hour"] = df["timestamp"].dt.hour
257
  off = df[(df["hour"] >= 23) | (df["hour"] < 6)]
@@ -273,7 +262,6 @@ def detect_anomalies(
273
 
274
  # 3) Many systems in a day (>= 5)
275
  if focus in ["general", "many_systems"]:
276
- # Need user + system
277
  user_col = None
278
  for cand in ["user", "username", "scientist", "employee"]:
279
  if cand in df.columns:
@@ -297,7 +285,7 @@ def detect_anomalies(
297
  "details": f"Accessed {row['system_count']} systems on {row['date']}"
298
  })
299
 
300
- # 4) Impossible travel – very rough demo (same user, two countries same day)
301
  if focus in ["general", "impossible_travel"]:
302
  user_col = None
303
  for cand in ["user", "username", "scientist", "employee"]:
@@ -313,14 +301,14 @@ def detect_anomalies(
313
  df["date"] = df["timestamp"].dt.date
314
  grouped = df.groupby([user_col, "date"])
315
  for (user, date), group in grouped:
316
- countries = group[loc_col].astype(str).str.strip().str.lower().unique()
317
- if len(countries) >= 2:
318
  anomalies.append({
319
  "type": "impossible_travel",
320
  "user": str(user),
321
  "date": str(date),
322
- "locations": list(map(str, countries)),
323
- "details": f"Multiple locations {countries} in single day"
324
  })
325
 
326
  return anomalies
@@ -330,9 +318,6 @@ def apply_intent_to_dataframe(
330
  df: pd.DataFrame,
331
  intent: Dict[str, Any]
332
  ) -> Tuple[pd.DataFrame, List[Dict[str, Any]], str]:
333
- """
334
- Return: (filtered_df, anomalies, filter_description)
335
- """
336
  df = normalize_column_names(df)
337
  action = intent.get("action", "run_log_query")
338
  params = intent.get("parameters", {})
@@ -340,7 +325,6 @@ def apply_intent_to_dataframe(
340
  time_range = params.get("time_range", "all_time")
341
  focus = params.get("focus", "general")
342
 
343
- # Basic filters
344
  filtered = basic_time_filter(df, time_range)
345
  filtered = basic_user_filter(filtered, users)
346
 
@@ -353,23 +337,77 @@ def apply_intent_to_dataframe(
353
  return filtered, anomalies, filter_desc
354
 
355
 
356
- # ---------- GRADIO UI LOGIC ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
 
358
  DESCRIPTION_MD = """
359
  # 🔍 Smart Log Copilot (CSV Demo)
360
 
361
- **Use case:** Pharma / corporate security teams analyzing login & access logs.
362
 
363
  1. Upload a **CSV log file** (with columns like `timestamp`, `user`, `system`, `status`, `country`, etc.)
364
  2. Ask questions in **plain English**, e.g.:
365
  - *"Was Dr. Rao doing anything suspicious this week?"*
366
- - *"Show night-time logins from any scientist."*
367
- - *"Who accessed too many systems in a single day?"*
368
  3. The app will:
369
  - Interpret your question via a local LLM (Qwen 1.5B)
370
  - Filter & analyse the CSV with Pandas
371
- - Run simple anomaly rules (off-hours, failures, many systems, impossible travel)
372
- - Return an easy-to-read summary + recommendations
373
 
374
  > For demo: a **placeholder anomaly screenshot** is shown whenever anomalies are found.
375
  """
@@ -377,38 +415,25 @@ DESCRIPTION_MD = """
377
  PLACEHOLDER_IMAGE_URL = "https://dummyimage.com/600x300/ff0000/ffffff&text=Anomaly+Screenshot+Placeholder"
378
 
379
 
380
- def load_csv(file_obj) -> Tuple[pd.DataFrame, str]:
 
 
381
  if file_obj is None:
382
- return pd.DataFrame(), "No file uploaded yet."
383
  try:
384
  df = pd.read_csv(file_obj.name)
385
  df = normalize_column_names(df)
386
  info = f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns."
387
- return df, info
388
  except Exception as e:
389
- return pd.DataFrame(), f"Error loading CSV: {e}"
390
-
391
-
392
- def chat_logic(
393
- user_message: str,
394
- history: List[List[str]],
395
- df_state: Optional[pd.DataFrame]
396
- ) -> Tuple[str, str]:
397
- """
398
- Main chat handler.
399
- Returns: (assistant_reply, anomaly_image_or_empty)
400
- """
401
- if df_state is None or df_state.empty:
402
- return "Please upload a CSV file with logs first.", ""
403
-
404
- # 1) Extract intent from LLM
405
- intent = extract_intent(user_message)
406
 
407
- # 2) Apply intent to dataframe → filter + anomaly detection
 
408
  filtered_df, anomalies, filter_desc = apply_intent_to_dataframe(df_state, intent)
409
 
410
- # 3) Prepare summary using LLM
411
- sample = filtered_df.head(30) # small sample
412
  summary = generate_summary(
413
  user_question=user_message,
414
  filter_description=filter_desc,
@@ -416,18 +441,62 @@ def chat_logic(
416
  anomalies=anomalies
417
  )
418
 
419
- # 4) If anomalies exist, show placeholder screenshot
420
- anomaly_image = PLACEHOLDER_IMAGE_URL if anomalies else ""
 
421
 
422
- return summary, anomaly_image
 
 
423
 
 
 
 
 
424
 
425
- with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
  gr.Markdown(DESCRIPTION_MD)
427
 
428
  with gr.Row():
429
  with gr.Column(scale=2):
430
  file_input = gr.File(label="Upload CSV log file", file_types=[".csv"])
 
431
  load_info = gr.Markdown("No file loaded.")
432
  with gr.Column(scale=3):
433
  df_preview = gr.Dataframe(
@@ -439,11 +508,9 @@ with gr.Blocks() as demo:
439
  df_state = gr.State(pd.DataFrame())
440
 
441
  def on_load_csv(file_obj):
442
- df, info = load_csv(file_obj)
443
- preview = df.head(20) if not df.empty else pd.DataFrame()
444
  return df, preview, info
445
 
446
- load_btn = gr.Button("Load CSV")
447
  load_btn.click(
448
  fn=on_load_csv,
449
  inputs=[file_input],
@@ -451,49 +518,56 @@ with gr.Blocks() as demo:
451
  )
452
 
453
  gr.Markdown("---")
454
- gr.Markdown("### 💬 Ask questions about the uploaded logs")
455
 
456
  with gr.Row():
457
  with gr.Column(scale=3):
458
- chatbot = gr.Chatbot(label="Smart Log Copilot")
 
 
 
459
  msg = gr.Textbox(
460
- label="Your question",
461
- placeholder="e.g. Was anyone logging in from outside India at night?",
462
  lines=2
463
  )
464
- send_btn = gr.Button("Send")
465
  with gr.Column(scale=2):
466
  anomaly_image = gr.Image(
467
  label="Anomaly Screenshot (placeholder)",
468
- value=None,
469
  visible=False
470
  )
 
 
 
 
 
 
471
 
472
- def on_user_message(user_message, chat_history, df):
473
- reply, img = chat_logic(user_message, chat_history, df)
474
- chat_history = chat_history + [[user_message, reply]]
475
- # Show image only if URL returned
476
- if img:
477
- return chat_history, gr.update(value=img, visible=True)
478
- else:
479
- return chat_history, gr.update(visible=False)
480
 
481
  send_btn.click(
482
  fn=on_user_message,
483
  inputs=[msg, chatbot, df_state],
484
- outputs=[chatbot, anomaly_image]
485
  )
486
 
487
  msg.submit(
488
  fn=on_user_message,
489
  inputs=[msg, chatbot, df_state],
490
- outputs=[chatbot, anomaly_image]
 
 
 
 
 
 
491
  )
492
 
493
  gr.Markdown(
494
  """
495
- **Tip:** Prepare a demo CSV with columns like:
496
- `timestamp, user, system, status, country, ip, device`
497
  and deliberately add:
498
  - multiple failed logins,
499
  - some late-night logins,
 
1
  import json
 
2
  from typing import Any, Dict, List, Optional, Tuple
3
+ from io import BytesIO
4
+ import tempfile
5
 
6
  import gradio as gr
7
  import pandas as pd
8
+ import matplotlib
9
+ matplotlib.use("Agg")
10
+ import matplotlib.pyplot as plt
11
+
12
  from transformers import AutoTokenizer, AutoModelForCausalLM
13
+ from fpdf import FPDF
14
 
15
+ # ------------------ MODEL LOADING ------------------
16
 
17
  MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
18
 
 
24
  torch_dtype="auto"
25
  )
26
 
27
+ # ------------------ LLM HELPERS ------------------
 
28
 
29
  def generate_llm(
30
  prompt: str,
31
  max_new_tokens: int = 512,
32
  temperature: float = 0.1
33
  ) -> str:
 
34
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
35
  outputs = model.generate(
36
  **inputs,
 
40
  pad_token_id=tokenizer.eos_token_id
41
  )
42
  full = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
43
  return full[len(prompt):].strip()
44
 
45
 
 
78
  and parameters filled with "any"/"all_time"/"general".
79
  """
80
 
 
81
  def extract_intent(user_message: str) -> Dict[str, Any]:
 
82
  user_block = f'USER_QUESTION: "{user_message}"\n\nReturn ONLY the JSON object now:'
83
  prompt = INTENT_SYSTEM_PROMPT + "\n" + user_block
84
  raw = generate_llm(prompt, max_new_tokens=256, temperature=0.1)
85
 
 
86
  try:
87
  first = raw.find("{")
88
  last = raw.rfind("}")
 
92
  raw_json = raw
93
  data = json.loads(raw_json)
94
  except Exception:
 
95
  data = {
96
  "action": "run_log_query",
97
  "parameters": {
 
130
  sample_rows: pd.DataFrame,
131
  anomalies: List[Dict[str, Any]]
132
  ) -> str:
133
+ if not sample_rows.empty:
134
+ sample_text = sample_rows.to_markdown(index=False)
135
+ else:
136
+ sample_text = "No matching rows."
137
+
138
  anomalies_text = json.dumps(anomalies, indent=2) if anomalies else "[]"
139
 
140
  prompt = SUMMARY_SYSTEM_PROMPT + "\n\n"
 
147
  return generate_llm(prompt, max_new_tokens=512, temperature=0.2)
148
 
149
 
150
+ # ------------------ CSV & ANOMALY ENGINE ------------------
151
 
152
  def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
153
  df = df.copy()
 
156
 
157
 
158
  def basic_time_filter(df: pd.DataFrame, time_range: str) -> pd.DataFrame:
 
 
 
 
159
  if "timestamp" not in df.columns:
160
  return df
161
 
 
181
  cutoff = now - pd.Timedelta(days=30)
182
  return df[df["timestamp"] >= cutoff]
183
  else:
 
184
  return df
185
 
186
 
187
  def basic_user_filter(df: pd.DataFrame, users: Any) -> pd.DataFrame:
 
 
 
188
  df = df.copy()
189
  user_col = None
190
  for cand in ["user", "username", "scientist", "employee"]:
 
201
  users = [users]
202
 
203
  users_norm = [u.strip().lower() for u in users]
204
+ return df[df[user_col].astype(str).str.lower().isin(users_norm)]
205
 
206
 
207
  def detect_anomalies(
208
  df: pd.DataFrame,
209
  focus: str = "general"
210
  ) -> List[Dict[str, Any]]:
 
 
 
 
 
 
 
 
211
  anomalies: List[Dict[str, Any]] = []
212
  if df.empty:
213
  return anomalies
214
 
 
215
  df = df.copy()
216
  if "timestamp" in df.columns:
217
  df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
218
 
219
  # 1) Login failures
220
  if focus in ["general", "login_failures"]:
 
221
  fail_mask = False
222
  for col in ["status", "result", "action"]:
223
  if col in df.columns:
224
  fail_mask = fail_mask | df[col].astype(str).str.lower().str.contains("fail")
225
  failed = df[fail_mask]
226
  if not failed.empty:
227
+ user_col = None
228
+ for cand in ["user", "username", "scientist", "employee"]:
229
+ if cand in df.columns:
230
+ user_col = cand
231
+ break
232
+ if user_col:
233
+ by_user = failed.groupby(user_col)
234
  for user, group in by_user:
235
+ if len(group) >= 3:
236
  anomalies.append({
237
  "type": "login_failures",
238
  "user": str(user),
 
240
  "details": f"{len(group)} failed events found for {user}"
241
  })
242
 
243
+ # 2) Off-hours (23:00–06:00)
244
  if "timestamp" in df.columns and focus in ["general", "off_hours"]:
245
  df["hour"] = df["timestamp"].dt.hour
246
  off = df[(df["hour"] >= 23) | (df["hour"] < 6)]
 
262
 
263
  # 3) Many systems in a day (>= 5)
264
  if focus in ["general", "many_systems"]:
 
265
  user_col = None
266
  for cand in ["user", "username", "scientist", "employee"]:
267
  if cand in df.columns:
 
285
  "details": f"Accessed {row['system_count']} systems on {row['date']}"
286
  })
287
 
288
+ # 4) Impossible travel – same user, 2 locations in same day
289
  if focus in ["general", "impossible_travel"]:
290
  user_col = None
291
  for cand in ["user", "username", "scientist", "employee"]:
 
301
  df["date"] = df["timestamp"].dt.date
302
  grouped = df.groupby([user_col, "date"])
303
  for (user, date), group in grouped:
304
+ locations = group[loc_col].astype(str).str.strip().str.lower().unique()
305
+ if len(locations) >= 2:
306
  anomalies.append({
307
  "type": "impossible_travel",
308
  "user": str(user),
309
  "date": str(date),
310
+ "locations": list(map(str, locations)),
311
+ "details": f"Multiple locations {list(locations)} in single day"
312
  })
313
 
314
  return anomalies
 
318
  df: pd.DataFrame,
319
  intent: Dict[str, Any]
320
  ) -> Tuple[pd.DataFrame, List[Dict[str, Any]], str]:
 
 
 
321
  df = normalize_column_names(df)
322
  action = intent.get("action", "run_log_query")
323
  params = intent.get("parameters", {})
 
325
  time_range = params.get("time_range", "all_time")
326
  focus = params.get("focus", "general")
327
 
 
328
  filtered = basic_time_filter(df, time_range)
329
  filtered = basic_user_filter(filtered, users)
330
 
 
337
  return filtered, anomalies, filter_desc
338
 
339
 
340
def calculate_risk_score(anomalies: List[Dict[str, Any]]):
    """Map the number of detected anomalies to a traffic-light risk triple.

    Returns (icon, label, count): 🟢/Low for none, 🟡/Medium for 1-2,
    🔴/High for 3 or more.
    """
    count = len(anomalies) if anomalies else 0
    if count == 0:
        return "🟢", "Low", 0
    if count <= 2:
        return "🟡", "Medium", count
    return "🔴", "High", count
347
+
348
+
349
def generate_bar_chart(df: pd.DataFrame):
    """Build a bar chart of event counts per system.

    Returns a matplotlib Figure, or None when the frame is empty or has
    no 'system' column (nothing meaningful to plot).
    """
    if df.empty or "system" not in df.columns:
        return None

    counts = df["system"].value_counts()
    fig, ax = plt.subplots(figsize=(6, 3))
    ax.bar(counts.index, counts.values)
    ax.set_title("Events per System")
    ax.set_xlabel("System")
    ax.set_ylabel("Events")
    plt.xticks(rotation=20)  # slight tilt so long system names stay readable
    fig.tight_layout()
    return fig
361
+
362
+
363
def _pdf_safe(text) -> str:
    """Coerce text to latin-1 for FPDF's built-in fonts.

    FPDF's core 'Arial' font only supports latin-1; the emoji risk icons
    (🟢/🟡/🔴) and the en-dash in the title would otherwise raise a
    Unicode encoding error when the PDF is written. Unsupported characters
    are replaced with '?'.
    """
    return str(text).encode("latin-1", "replace").decode("latin-1")


def build_pdf_report(summary_text, anomalies, risk_icon, risk_label):
    """Render the analysis into a PDF and return the temp-file path.

    Args:
        summary_text: LLM-generated summary (may contain markdown/emoji).
        anomalies: list of anomaly dicts with 'type' and 'details' keys.
        risk_icon: traffic-light emoji from calculate_risk_score.
        risk_label: 'Low' / 'Medium' / 'High'.

    Returns:
        Path of a NamedTemporaryFile (delete=False, so Gradio can serve it).
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    pdf.multi_cell(0, 10, _pdf_safe("Security Report – Smart Log Copilot"), align="L")
    pdf.ln(2)
    pdf.multi_cell(0, 10, _pdf_safe(f"Risk Level: {risk_icon} {risk_label}"), align="L")
    pdf.ln(5)

    pdf.set_font("Arial", size=11)
    pdf.multi_cell(0, 7, "Summary:", align="L")
    pdf.set_font("Arial", size=10)
    pdf.multi_cell(0, 6, _pdf_safe(summary_text))
    pdf.ln(5)

    pdf.set_font("Arial", size=11)
    pdf.multi_cell(0, 7, "Detected Anomalies:", align="L")
    pdf.set_font("Arial", size=10)
    if anomalies:
        for an in anomalies:
            line = f"- {an.get('type', '')}: {an.get('details', '')}"
            pdf.multi_cell(0, 6, _pdf_safe(line))
    else:
        pdf.multi_cell(0, 6, "No anomalies detected.")

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    pdf.output(tmp.name)
    return tmp.name
392
+
393
+
394
+ # ------------------ DEMO DESCRIPTION ------------------
395
 
396
  DESCRIPTION_MD = """
397
  # 🔍 Smart Log Copilot (CSV Demo)
398
 
399
+ **Use case:** Pharma / corporate security teams analysing login & access logs.
400
 
401
  1. Upload a **CSV log file** (with columns like `timestamp`, `user`, `system`, `status`, `country`, etc.)
402
  2. Ask questions in **plain English**, e.g.:
403
  - *"Was Dr. Rao doing anything suspicious this week?"*
404
+ - *"Who logged in late at night?"*
405
+ - *"Who accessed too many systems in a day?"*
406
  3. The app will:
407
  - Interpret your question via a local LLM (Qwen 1.5B)
408
  - Filter & analyse the CSV with Pandas
409
+ - Run anomaly rules (off-hours, failures, many systems, impossible travel)
410
+ - Return an easy-to-read summary + risk level + optional PDF report.
411
 
412
  > For demo: a **placeholder anomaly screenshot** is shown whenever anomalies are found.
413
  """
 
415
  PLACEHOLDER_IMAGE_URL = "https://dummyimage.com/600x300/ff0000/ffffff&text=Anomaly+Screenshot+Placeholder"
416
 
417
 
418
+ # ------------------ CORE CHAT LOGIC ------------------
419
+
420
+ def load_csv(file_obj):
421
  if file_obj is None:
422
+ return pd.DataFrame(), pd.DataFrame(), "No file uploaded yet."
423
  try:
424
  df = pd.read_csv(file_obj.name)
425
  df = normalize_column_names(df)
426
  info = f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns."
427
+ return df, df.head(20), info
428
  except Exception as e:
429
+ return pd.DataFrame(), pd.DataFrame(), f"Error loading CSV: {e}"
430
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
 
432
def chat_logic(user_message: str, df_state: pd.DataFrame):
    """Run the full pipeline for one question.

    Steps: LLM intent extraction → pandas filtering + anomaly rules →
    LLM summary over a small row sample.

    Returns (summary_text, placeholder_image_url_or_empty, filtered_df,
    anomalies).
    """
    intent = extract_intent(user_message)
    filtered_df, anomalies, filter_desc = apply_intent_to_dataframe(df_state, intent)

    # Keep the LLM prompt small: only the first 30 matching rows.
    summary = generate_summary(
        user_question=user_message,
        filter_description=filter_desc,
        sample_rows=filtered_df.head(30),
        anomalies=anomalies,
    )

    image_url = PLACEHOLDER_IMAGE_URL if anomalies else ""
    return summary, image_url, filtered_df, anomalies
446
+
447
 
448
def on_user_message(user_message, chat_history, df):
    """Gradio chat handler.

    Appends the user turn, runs the analysis pipeline, and returns updates
    for (chatbot, anomaly_image, chart_plot, report_state). History uses
    the 'messages' format: dicts with 'role' and 'content'.
    """
    history = chat_history + [{"role": "user", "content": user_message}]

    # Without data there is nothing to analyse — keep all panels hidden.
    if df is None or df.empty:
        history = history + [{
            "role": "assistant",
            "content": "📂 Please upload a CSV file with logs first.",
        }]
        return history, gr.update(visible=False), gr.update(visible=False), None

    summary_text, img, filtered_df, anomalies = chat_logic(user_message, df)

    risk_icon, risk_label, _ = calculate_risk_score(anomalies)
    reply_text = f"{risk_icon} **Risk Level: {risk_label}**\n\n" + summary_text
    history = history + [{"role": "assistant", "content": reply_text}]

    # Show the activity chart only when one could be drawn.
    fig = generate_bar_chart(filtered_df)
    chart_update = (
        gr.update(value=fig, visible=True)
        if fig is not None
        else gr.update(visible=False)
    )

    # Show the placeholder screenshot only when anomalies were found.
    img_update = (
        gr.update(value=img, visible=True)
        if img
        else gr.update(visible=False)
    )

    # Stashed in report_state for the "Generate PDF Report" button.
    report_meta = (reply_text, anomalies, risk_icon, risk_label)

    return history, img_update, chart_update, report_meta
481
+
482
+
483
def on_generate_report(report_meta):
    """Build the PDF from the last analysis and reveal the download widget.

    report_meta is the (summary, anomalies, icon, label) tuple stashed by
    on_user_message; when no analysis has run yet, keep the widget hidden.
    """
    if not report_meta:
        return gr.update(visible=False)

    summary_text, anomalies, icon, label = report_meta
    path = build_pdf_report(summary_text, anomalies, icon, label)
    return gr.update(value=path, visible=True)
489
+
490
+
491
+ # ------------------ GRADIO UI ------------------
492
+
493
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", neutral_hue="gray")) as demo:
494
  gr.Markdown(DESCRIPTION_MD)
495
 
496
  with gr.Row():
497
  with gr.Column(scale=2):
498
  file_input = gr.File(label="Upload CSV log file", file_types=[".csv"])
499
+ load_btn = gr.Button("Load CSV")
500
  load_info = gr.Markdown("No file loaded.")
501
  with gr.Column(scale=3):
502
  df_preview = gr.Dataframe(
 
508
  df_state = gr.State(pd.DataFrame())
509
 
510
  def on_load_csv(file_obj):
511
+ df, preview, info = load_csv(file_obj)
 
512
  return df, preview, info
513
 
 
514
  load_btn.click(
515
  fn=on_load_csv,
516
  inputs=[file_input],
 
518
  )
519
 
520
  gr.Markdown("---")
521
+ gr.Markdown("### 💬 Smart Log Copilot")
522
 
523
  with gr.Row():
524
  with gr.Column(scale=3):
525
+ chatbot = gr.Chatbot(
526
+ label=None,
527
+ type="messages",
528
+ )
529
  msg = gr.Textbox(
530
+ placeholder="Ask a question like: Who logged in late at night?",
531
+ show_label=False,
532
  lines=2
533
  )
534
+ send_btn = gr.Button("Send", variant="primary")
535
  with gr.Column(scale=2):
536
  anomaly_image = gr.Image(
537
  label="Anomaly Screenshot (placeholder)",
 
538
  visible=False
539
  )
540
+ chart_plot = gr.Plot(
541
+ label="Log Activity Chart",
542
+ visible=False
543
+ )
544
+ report_btn = gr.Button("Generate PDF Report", variant="secondary")
545
+ pdf_file = gr.File(label="Download Security Report", visible=False)
546
 
547
+ report_state = gr.State()
 
 
 
 
 
 
 
548
 
549
  send_btn.click(
550
  fn=on_user_message,
551
  inputs=[msg, chatbot, df_state],
552
+ outputs=[chatbot, anomaly_image, chart_plot, report_state]
553
  )
554
 
555
  msg.submit(
556
  fn=on_user_message,
557
  inputs=[msg, chatbot, df_state],
558
+ outputs=[chatbot, anomaly_image, chart_plot, report_state]
559
+ )
560
+
561
+ report_btn.click(
562
+ fn=on_generate_report,
563
+ inputs=[report_state],
564
+ outputs=[pdf_file]
565
  )
566
 
567
  gr.Markdown(
568
  """
569
+ **Tip:** Use a demo CSV with columns like:
570
+ `timestamp, user, system, status, country`
571
  and deliberately add:
572
  - multiple failed logins,
573
  - some late-night logins,