PatrickRedStar commited on
Commit
29fdac9
·
1 Parent(s): b60b1ca
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/*
README.md CHANGED
@@ -1,12 +1,68 @@
1
- ---
2
- title: Logreader
3
- emoji: 🐠
4
- colorFrom: yellow
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 6.1.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Log Compiler App
2
+
3
+ Gradio demo that ingests raw logs/stacktraces, classifies the incident type, paraphrases to human language, retrieves local runbooks, and proposes checks. The pipeline uses multiple Hugging Face transformer models (zero-shot classifier, summarizer, sentence-embedding retriever; optional reranker and NLI verifier).
4
+
5
+ ## Setup
6
+
7
+ 1. Python 3.10+ recommended.
8
+ 2. Install deps (downloads models on first run):
9
+ ```bash
10
+ pip install -r requirements.txt
11
+ ```
12
+
13
+ ## Run
14
+
15
+ ```bash
16
+ python app.py
17
+ ```
18
+
19
+ Gradio UI will open in the browser. Models load once at startup.
20
+
21
+ Если localhost недоступен (WSL/прокси), приложение по умолчанию включает share-ссылку. Чтобы явно управлять:
22
+ ```bash
23
+ # форсировать публичный линк
24
+ GRADIO_SHARE=1 python app.py
25
+ # отключить share (если localhost доступен)
26
+ GRADIO_SHARE=0 python app.py
27
+ ```
28
+ Запуск уже включает `server_name=0.0.0.0`.
29
+
30
+ ## Запуск на Hugging Face Spaces
31
+
32
+ - Выложите содержимое репозитория в новый Space (Gradio).
33
+ - `app.py` автоматически отключит `share` в окружении Spaces (`SPACE_ID`/`HF_SPACE`), так что дополнительная настройка не нужна.
34
+ - Заводываются зависимости из `requirements.txt` автоматически. При необходимости можно добавить `runtime.txt` с версией Python (например, `python-3.10`).
35
+
36
+ ## How to use
37
+
38
+ - Paste logs/stacktrace in the left text box.
39
+ - Pick a source (auto/python/java/node/k8s).
40
+ - Toggle:
41
+ - `Use retrieval (local KB)` to search `kb/` runbooks.
42
+ - `Verify hypothesis (NLI)` to check entailment against the logs.
43
+ - Adjust verbosity (0-2) for explanation detail.
44
+ - Click **Analyze**. Use **Generate ticket template** to get a pre-filled ticket, and **Export JSON** to download results.
45
+
46
+ ## Samples
47
+
48
+ Try the provided snippets:
49
+
50
+ - `samples/sample_python.txt` – HTTP timeout stacktrace.
51
+ - `samples/sample_k8s.txt` – CrashLoop/OOMKilled pod.
52
+ - `samples/sample_java.txt` – NullPointerException auth failure.
53
+
54
+ ## Files
55
+
56
+ - `app.py` – Gradio UI wiring.
57
+ - `pipeline.py` – ML pipeline (classification, summarization, retrieval, NLI).
58
+ - `preprocess.py` – Masking, signature detection, safe truncation.
59
+ - `retrieval.py` – Embedding search over `kb/` markdown runbooks.
60
+ - `kb/` – Local runbooks (edit/add your own).
61
+ - `samples/` – Example logs to paste.
62
+
63
+ ## Notes
64
+
65
+ - First run downloads models (`facebook/bart-large-mnli`, `sshleifer/distilbart-cnn-12-6`, `sentence-transformers/all-MiniLM-L6-v2`, optional reranker `cross-encoder/ms-marco-MiniLM-L-6-v2`, and NLI `typeform/distilbert-base-uncased-mnli`).
66
+ - Handles empty/short input with a clear error.
67
+ - Output tabs: Incident Type, Human Explanation, Likely Cause + Checks, Retrieved Runbooks, Verification, Ticket Template.
68
+ - Pin `numpy<2.0` to avoid ABI conflicts with PyTorch/Transformers wheels. If you already installed numpy 2.x, run `pip install 'numpy<2' --upgrade`.
app.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import tempfile
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import gradio as gr
7
+
8
+ from pipeline import IncidentPipeline, IncidentResult, serialize_result
9
+ from preprocess import truncate_logs
10
+
11
+
12
# Built once at import time so the transformer models load a single time per process.
pipeline = IncidentPipeline()
13
+
14
+
15
def format_incident_section(result: IncidentResult) -> str:
    """Render the incident classification as a Markdown summary block."""
    alternatives = [f"{alt['label']} ({alt['score']:.2f})" for alt in result.incident_alternatives]
    alt_text = ", ".join(alternatives)
    signature_text = ", ".join(result.signatures) if result.signatures else "none"
    parts = [
        f"**Incident:** {result.incident_label} (confidence {result.incident_score:.2f})",
        f"**Top alternatives:** {alt_text if alt_text else 'n/a'}",
        f"**Detected signatures:** {signature_text}",
    ]
    return "\n\n".join(parts)
23
+
24
+
25
def format_cause_section(result: IncidentResult) -> str:
    """Render the likely cause plus its check list as Markdown."""
    bullets = "\n".join(f"- {check}" for check in result.checks)
    return f"**Likely cause:** {result.likely_cause}\n\n**Checks / next steps:**\n{bullets}"
28
+
29
+
30
def analyze_logs(logs: str, source: str, use_retrieval: bool, use_nli: bool, verbosity: int):
    """Run the incident pipeline and map the result onto the UI outputs.

    Returns a 7-tuple: (incident markdown, explanation, cause markdown,
    retrieval rows, verification rows, state object, status message).
    On failure the markdown slots carry an error message and state is None.
    """
    try:
        result = pipeline.process(
            logs,
            source=source,
            use_retrieval=use_retrieval,
            use_nli=use_nli,
            verbosity=verbosity,
        )
    except Exception as exc:  # surface pipeline failures in the UI instead of crashing
        no_rows: List[List[Any]] = []
        return (f"Error: {exc}", "", "", no_rows, no_rows, None, f"Failed: {exc}")

    kb_rows = [
        [hit["title"], round(hit["score"], 3), hit["path"], hit["excerpt"]]
        for hit in result.retrieved
    ]
    nli_rows = [
        [item["hypothesis"], item["label"], round(item["score"], 3)]
        for item in result.verification
    ]
    return (
        format_incident_section(result),
        result.explanation,
        format_cause_section(result),
        kb_rows,
        nli_rows,
        result,
        "Analysis completed.",
    )
68
+
69
+
70
def ticket_template(state: Optional[IncidentResult], logs: str) -> str:
    """Build a pre-filled ticket body from the last analysis result.

    Returns a prompt to run the analysis when no result is cached yet.
    """
    if state is None:
        return "Run analysis first."
    snippet = truncate_logs(logs, head_lines=30, tail_lines=10, max_lines=60)
    check_bullets = "\n".join(f"- {check}" for check in state.checks)
    summary_line = f"{state.incident_label} — {state.explanation[:180]}"
    sections = [
        f"Summary:\n{summary_line}",
        "Steps to reproduce:\n- Describe sequence leading to error (fill in).\n- Attach failing request/sample data.",
        "Expected:\n- Service handles request successfully.",
        f"Actual:\n- {state.likely_cause}",
        f"Checks performed / next steps:\n{check_bullets}",
        f"Logs snippet:\n{snippet}\n",
    ]
    return "\n\n".join(sections)
85
+
86
+
87
def export_json(state: Optional[IncidentResult]):
    """Serialize the cached result to a temporary ``.json`` file.

    Returns the file path for the Gradio File component, or None when no
    analysis has run yet. The file is deliberately not auto-deleted
    (``delete=False``) so Gradio can serve it after this function returns.
    """
    if state is None:
        return None
    payload = serialize_result(state)
    # Context manager guarantees the handle is flushed and closed even if the
    # write raises, instead of leaking an open temp file.
    with tempfile.NamedTemporaryFile(
        "w", delete=False, suffix=".json", encoding="utf-8"
    ) as handle:
        handle.write(payload)
        return handle.name
96
+
97
+
98
# UI layout and event wiring. Built at import time; launched from __main__.
with gr.Blocks(title="Log Compiler App") as demo:
    gr.Markdown("# Log Compiler App\nPaste logs/stacktrace to get incident classification, explanations, and runbook suggestions.")
    # Holds the last IncidentResult so the ticket/export callbacks can reuse it.
    state = gr.State()

    with gr.Row():
        # Left column: inputs, options, and action buttons.
        with gr.Column(scale=1):
            logs_input = gr.Textbox(lines=20, label="Logs / Stacktrace", placeholder="Paste logs here...")
            source_dropdown = gr.Dropdown(
                ["auto", "python", "java", "node", "k8s"],
                value="auto",
                label="Source",
            )
            use_retrieval = gr.Checkbox(value=True, label="Use retrieval (local KB)")
            use_nli = gr.Checkbox(value=False, label="Verify hypothesis (NLI)")
            verbosity_slider = gr.Slider(0, 2, value=1, step=1, label="Verbosity")
            analyze_btn = gr.Button("Analyze")
            ticket_btn = gr.Button("Generate ticket template")
            export_btn = gr.Button("Export JSON")
            json_output = gr.File(label="JSON export")
            status = gr.Markdown("Ready.")
        # Right column: tabbed result views.
        # NOTE(review): Gradio documents `scale` as an integer; 1.2 may be
        # coerced or emit a warning depending on the pinned version — confirm.
        with gr.Column(scale=1.2):
            with gr.Tab("Incident Type"):
                incident_md = gr.Markdown()
            with gr.Tab("Human Explanation"):
                explanation_md = gr.Markdown()
            with gr.Tab("Likely Cause + Checks"):
                cause_md = gr.Markdown()
            with gr.Tab("Retrieved Runbooks"):
                retrieval_df = gr.Dataframe(
                    headers=["Title", "Score", "Path", "Excerpt"],
                    datatype=["str", "number", "str", "str"],
                    interactive=False,
                )
            with gr.Tab("Verification"):
                verification_df = gr.Dataframe(
                    headers=["Hypothesis", "Label", "Score"],
                    datatype=["str", "str", "number"],
                    interactive=False,
                )
            with gr.Tab("Ticket Template"):
                ticket_md = gr.Markdown()

    # Analyze fills every tab plus the shared state and status line.
    analyze_btn.click(
        fn=analyze_logs,
        inputs=[logs_input, source_dropdown, use_retrieval, use_nli, verbosity_slider],
        outputs=[incident_md, explanation_md, cause_md, retrieval_df, verification_df, state, status],
    )

    # Ticket generation reuses the cached state plus the raw log text.
    ticket_btn.click(
        fn=ticket_template,
        inputs=[state, logs_input],
        outputs=ticket_md,
    )

    # Export writes the cached state to a temp JSON file served by gr.File.
    export_btn.click(
        fn=export_json,
        inputs=state,
        outputs=json_output,
    )
157
+
158
+
159
if __name__ == "__main__":
    """Entry point: decide on the Gradio share link, then launch."""
    share_env = os.getenv("GRADIO_SHARE")
    in_hf_space = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE"))
    # Spaces serves the app itself, so no share link is needed there; locally we
    # default to share=True to work around unreachable-localhost setups (WSL/proxies).
    if in_hf_space:
        share_flag = False
    elif share_env is None:
        share_flag = True
    else:
        share_flag = share_env.lower() in ("1", "true", "yes")
    demo.launch(server_name="0.0.0.0", share=share_flag)
kb/auth_401_403.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Auth 401/403
2
+
3
+ ## Symptoms
4
+ - API returns 401/403 for valid requests
5
+ - `Invalid token`, `permission denied`, or `signature mismatch` in logs
6
+ - Clock skew errors in authentication service
7
+
8
+ ## Checks
9
+ - Validate access token expiration and issuer
10
+ - Confirm user/service account scopes/roles
11
+ - Check client/server clock skew (NTP)
12
+ - Review recent secret/credential rotations
13
+ - Inspect identity provider availability and rate limits
14
+
15
+ ## Fix
16
+ - Refresh/rotate tokens or credentials
17
+ - Grant correct roles/scopes to caller
18
+ - Align clocks and retry
19
+ - Apply retry/backoff if IDP is throttling
kb/db_connection_pool.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Database Connection Pool Exhaustion
2
+
3
+ ## Symptoms
4
+ - `too many connections`, `connection refused`, or pool timeout errors
5
+ - Spikes in DB connections and wait time
6
+ - Slow queries or lock contention observed
7
+
8
+ ## Checks
9
+ - Inspect application pool size vs database max connections
10
+ - Review slow queries and transaction lengths
11
+ - Check connection leak metrics and proper closing
12
+ - Validate DB host/port/DNS and TLS settings
13
+ - Examine recent traffic/load changes
14
+
15
+ ## Fix
16
+ - Tune pool size and DB max connections
17
+ - Fix connection leaks and ensure pooling is enabled
18
+ - Optimize slow queries; add indexes where needed
19
+ - Scale database or replicas to handle load
kb/dns_failure.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DNS Resolution Failure
2
+
3
+ ## Symptoms
4
+ - `getaddrinfo ENOTFOUND` or `NameResolutionFailure`
5
+ - Transient errors resolving service hostnames
6
+ - Works from one namespace/host but not another
7
+
8
+ ## Checks
9
+ - Resolve target host from pod/host (`nslookup`, `dig`)
10
+ - Inspect `/etc/resolv.conf` search domains and ndots
11
+ - Verify CoreDNS logs for SERVFAIL/REFUSED
12
+ - Check recent DNS changes or missing A/CNAME records
13
+ - Validate network policies allowing DNS traffic
14
+
15
+ ## Fix
16
+ - Correct service/record names and search domains
17
+ - Restart CoreDNS or propagate zone updates
18
+ - Add caching and lower ndots if needed
19
+ - Update network policies to allow UDP/TCP 53
kb/java_null_pointer.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Java NullPointerException
2
+
3
+ ## Symptoms
4
+ - Stacktrace contains `NullPointerException`
5
+ - Fails during specific code paths or after deploy
6
+ - Sometimes triggered by missing config or feature flags
7
+
8
+ ## Checks
9
+ - Inspect stacktrace frames to find source file and line
10
+ - Validate configuration/feature flag defaults are present
11
+ - Add guards for optional fields and log offending values
12
+ - Review recent code changes around the failing area
13
+ - Add unit tests covering null/missing data inputs
14
+
15
+ ## Fix
16
+ - Add null checks and default values
17
+ - Ensure config/flags are loaded before use
18
+ - Deploy fix and monitor for recurrence
kb/k8s_crashloop.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # K8s CrashLoopBackOff
2
+
3
+ ## Symptoms
4
+ - Pod restarts repeatedly with `CrashLoopBackOff`
5
+ - Health checks failing or process exits quickly
6
+ - Logs end abruptly after startup
7
+
8
+ ## Checks
9
+ - Inspect last container logs before restart (`kubectl logs -p`)
10
+ - Validate readiness/liveness probes and command/args
11
+ - Confirm config/secret mounts paths and permissions
12
+ - Check for missing env vars or failing dependency endpoints
13
+ - Review recent image/config deploys
14
+
15
+ ## Fix
16
+ - Correct probes or increase initial delays
17
+ - Fix missing configuration or credentials
18
+ - Add retries/backoff around external dependencies
19
+ - Roll back to last known good release if needed
kb/oom_killed.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OOM / Memory Pressure
2
+
3
+ ## Symptoms
4
+ - Pod terminated with `OOMKilled` or JVM `OutOfMemoryError`
5
+ - Memory usage climbs until eviction
6
+ - Core dump or heap dump generated
7
+
8
+ ## Checks
9
+ - Compare pod/container memory limits vs peak usage
10
+ - Inspect heap/thread dumps for leaks or unbounded caches
11
+ - Validate GC settings and memory flags
12
+ - Check for large payloads or unbounded batching
13
+ - Review sidecar/agent memory consumption
14
+
15
+ ## Fix
16
+ - Right-size memory requests/limits
17
+ - Fix leaks or cap caches/buffers
18
+ - Split large batches/payloads
19
+ - Adjust GC or runtime memory options
kb/timeout_slow_service.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Timeout / Slow Dependency
2
+
3
+ ## Symptoms
4
+ - Requests fail with `timeout` or `deadline exceeded`
5
+ - Upstream latency spikes in metrics
6
+ - Retries triggered with backoff
7
+
8
+ ## Checks
9
+ - Measure RTT and latency between caller and dependency
10
+ - Inspect dependency saturation (CPU, connections, queue depth)
11
+ - Check long-running queries or GC pauses
12
+ - Validate timeout/retry configuration and circuit breakers
13
+ - Review recent deploys or infrastructure changes impacting latency
14
+
15
+ ## Fix
16
+ - Tune timeouts/retries to realistic values
17
+ - Optimize slow queries or handlers
18
+ - Add caching/batching if appropriate
19
+ - Scale dependency horizontally or vertically
kb/tls_handshake.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TLS Handshake Issues
2
+
3
+ ## Symptoms
4
+ - `SSLHandshakeException`, `certificate verify failed`, or `unknown_ca`
5
+ - Works with curl -k but fails with client defaults
6
+ - Errors after certificate rotation
7
+
8
+ ## Checks
9
+ - Validate certificate chain, expiry, and SAN/hostname match
10
+ - Confirm protocol/cipher compatibility between client and server
11
+ - Check ALPN/SNI configuration for proxies or ingress
12
+ - Inspect system trust store and custom CA bundles
13
+ - Review mTLS settings and key/cert presence
14
+
15
+ ## Fix
16
+ - Install correct CA bundle and full certificate chain
17
+ - Align TLS versions/ciphers or disable legacy protocols
18
+ - Configure SNI/ALPN correctly on clients and proxies
19
+ - Rotate certificates/keys and restart workloads
pipeline.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from dataclasses import asdict, dataclass
4
+ from typing import Dict, List, Optional
5
+
6
+ # Force CPU usage to avoid CUDA capability issues in WSL/GPU-mismatch environments.
7
+ os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
8
+
9
+ from transformers import pipeline
10
+
11
+ from preprocess import PreprocessResult, preprocess_logs
12
+ from retrieval import RunbookRetriever
13
+
14
+
15
# Candidate incident categories for the zero-shot classifier. These strings
# double as keys into the cause/check tables in
# IncidentPipeline.generate_cause_and_checks, so renaming one must be done in
# both places.
CANDIDATE_LABELS = [
    "oom",
    "timeout",
    "auth_failure",
    "db_connection",
    "dns_resolution",
    "tls_handshake",
    "crashloop",
    "null_pointer",
    "resource_exhaustion",
    "network_partition",
]
27
+
28
+
29
@dataclass
class IncidentResult:
    """Aggregated output of IncidentPipeline.process for one log submission."""

    # Top predicted incident category and its classifier confidence.
    incident_label: str
    incident_score: float
    # Runner-up labels as [{"label": ..., "score": ...}, ...] (up to three).
    incident_alternatives: List[Dict]
    # Plain-language summary of the logs produced by the summarizer.
    explanation: str
    # One-line likely-cause statement derived from the label.
    likely_cause: str
    # Suggested diagnostic steps (padded to at least 5, capped at 10).
    checks: List[str]
    # Runbook hits: [{"title", "score", "path", "excerpt"}, ...].
    retrieved: List[Dict]
    # NLI verdicts: [{"hypothesis", "label", "score"}, ...]; empty unless use_nli.
    verification: List[Dict]
    # Signature tags detected during preprocessing (e.g. "stacktrace", "oom").
    signatures: List[str]
40
+
41
+
42
class ModelStore:
    """Container that eagerly loads all transformer pipelines at construction.

    Every pipeline is pinned to CPU (``device=-1``), matching the
    CUDA_VISIBLE_DEVICES guard set at module import.
    """

    def __init__(self):
        # Zero-shot classifier used for incident-type prediction.
        self.classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
            device=-1,
        )
        # Abstractive summarizer used for the human-readable explanation.
        self.summarizer = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=-1,
        )
        # NLI model used for optional hypothesis verification.
        self.nli = pipeline(
            "text-classification",
            model="typeform/distilbert-base-uncased-mnli",
            device=-1,
        )
59
+
60
+
61
class IncidentPipeline:
    """End-to-end incident analysis over raw log text.

    Combines zero-shot classification, summarization, local runbook
    retrieval, and optional NLI verification. ``process`` is the single
    entry point used by the UI layer.
    """

    def __init__(self):
        # Models and KB embeddings load eagerly so the first request is fast.
        self.models = ModelStore()
        self.retriever = RunbookRetriever()

    def classify(self, text: str, source: str) -> Dict:
        """Zero-shot classify *text* into an incident category.

        Returns ``{"label", "score", "alternatives"}`` where alternatives
        holds up to three runner-up labels with their scores.
        """
        labels = list(CANDIDATE_LABELS)
        if source and source != "auto":
            # Offer the classifier a source-flavored catch-all label too.
            labels.append(f"{source}_specific")
        res = self.models.classifier(text, candidate_labels=labels, multi_label=False)
        label = res["labels"][0]
        score = float(res["scores"][0])
        alternatives = [
            {"label": res["labels"][i], "score": float(res["scores"][i])}
            for i in range(1, min(4, len(res["labels"])))
        ]
        return {"label": label, "score": score, "alternatives": alternatives}

    def explain(self, text: str, verbosity: int = 1) -> str:
        """Summarize *text* in plain language; higher verbosity → longer summary."""
        max_len = 180 + 60 * verbosity
        min_len = 40 + 20 * verbosity
        summary = self.models.summarizer(
            text,
            max_length=max_len,
            min_length=min_len,
            truncation=True,
        )[0]["summary_text"]
        return summary

    def generate_cause_and_checks(
        self, result: PreprocessResult, label: str, retrieved: List[Dict]
    ) -> tuple[str, List[str]]:
        """Map the predicted *label* to a likely-cause line plus 5-10 checks.

        Checks start with generic triage steps, then label-specific steps,
        then a pointer to the best retrieved runbook (if any).
        """
        cause_map = {
            "oom": "Service likely exhausted memory and was terminated.",
            "crashloop": "Container keeps restarting due to repeated failures or failed health checks.",
            "timeout": "Upstream or dependency timed out handling the request.",
            "auth_failure": "Authentication/authorization failed (expired token, missing permissions, or misconfiguration).",
            "db_connection": "Database connection pool exhausted or connection refused.",
            "dns_resolution": "DNS resolution failed for upstream host.",
            "tls_handshake": "TLS handshake failed (bad cert, protocol mismatch).",
            "null_pointer": "Application hit null/None reference and crashed.",
            "resource_exhaustion": "System resources (CPU/file descriptors) exhausted.",
            "network_partition": "Network partition or connectivity issue between components.",
        }
        # Label-specific diagnostic steps (OOM is handled separately below
        # because it can also be triggered by a detected signature).
        label_checks = {
            "timeout": [
                "Measure latency between service and dependencies.",
                "Verify retry/backoff settings and circuit breakers.",
                "Check for slow queries or downstream saturation.",
            ],
            "auth_failure": [
                "Verify tokens/credentials validity and scopes.",
                "Check clock skew between services.",
                "Review authentication provider health and rate limits.",
            ],
            "db_connection": [
                "Check DB connection pool size vs load.",
                "Inspect database for locks or slow queries.",
                "Verify database host/port/DNS correctness.",
            ],
            "dns_resolution": [
                "Resolve target host from pod/host manually.",
                "Check DNS server health and recent DNS changes.",
                "Verify search domains and /etc/resolv.conf inside pod/container.",
            ],
            "tls_handshake": [
                "Validate certificates (expiry, SANs, chain).",
                "Check protocol/cipher compatibility between client and server.",
                "Inspect ALPN/SNI configuration.",
            ],
            "crashloop": [
                "Inspect startup probes/health checks and command overrides.",
                "Review last logs before restart for root cause.",
                "Confirm config/secret mounts exist and permissions are correct.",
            ],
        }
        cause = cause_map.get(label, f"Most likely incident category: {label}.")
        checks: List[str] = [
            "Confirm timeframe of failure in logs and recent deploys.",
            "Check service and pod/resource metrics (CPU, memory, restarts) around the incident window.",
            "Inspect recent configuration or secrets changes.",
        ]
        # OOM advice applies when the memory signature was detected during
        # preprocessing even if the classifier picked another label.
        if label == "oom" or "oom" in result.signatures:
            checks += [
                "Inspect container memory limits/requests and current usage.",
                "Review heap/thread dumps if available.",
                "Check for memory leaks or unbounded caches.",
                "Ensure JVM/Runtime memory flags are configured correctly.",
            ]
        checks += label_checks.get(label, [])
        if retrieved:
            checks.append(f"Consult runbook: {retrieved[0]['title']} (score {retrieved[0]['score']:.2f}).")
        # Pad to at least five checks so the UI list never looks empty.
        while len(checks) < 5:
            checks.append("Add extra diagnostic step: capture more logs and metrics.")
        return cause, checks[:10]

    def verify_hypotheses(self, premise: str, hypotheses: List[str]) -> List[Dict]:
        """Score each hypothesis against the log text with the NLI model."""
        results = []
        for hyp in hypotheses:
            pred = self.models.nli({"text": premise, "text_pair": hyp})
            # Bug fix: the text-classification pipeline only wraps *str* inputs
            # in a list; a single dict input returns a plain dict, so the old
            # ``[...][0]`` indexing raised KeyError. Accept both shapes.
            if isinstance(pred, list):
                pred = pred[0]
            results.append(
                {"hypothesis": hyp, "label": pred["label"], "score": float(pred["score"])}
            )
        return results

    def process(
        self,
        raw_text: str,
        source: str = "auto",
        use_retrieval: bool = True,
        use_nli: bool = False,
        verbosity: int = 1,
    ) -> IncidentResult:
        """Run the full analysis pipeline over *raw_text*.

        Raises:
            ValueError: if *raw_text* is empty or whitespace-only.
        """
        if not raw_text or not raw_text.strip():
            raise ValueError("Logs input is empty. Please provide logs or stacktrace text.")
        pre = preprocess_logs(raw_text)
        cls = self.classify(pre.cleaned_text, source)
        explanation = self.explain(pre.cleaned_text, verbosity=verbosity)
        retrieved = self.retriever.search(pre.cleaned_text, top_k=3) if use_retrieval else []
        cause, checks = self.generate_cause_and_checks(pre, cls["label"], retrieved)
        verification = []
        if use_nli:
            # Verify the cause statement plus every runbook match against the logs.
            hypotheses = [cause] + [f"Runbook match: {r['title']}" for r in retrieved]
            verification = self.verify_hypotheses(pre.cleaned_text, hypotheses)
        return IncidentResult(
            incident_label=cls["label"],
            incident_score=cls["score"],
            incident_alternatives=cls["alternatives"],
            explanation=explanation,
            likely_cause=cause,
            checks=checks,
            retrieved=retrieved,
            verification=verification,
            signatures=pre.signatures,
        )
198
+
199
+
200
def serialize_result(result: IncidentResult) -> str:
    """Render an IncidentResult as pretty-printed, non-ASCII-safe JSON."""
    payload = asdict(result)
    return json.dumps(payload, ensure_ascii=False, indent=2)
preprocess.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from dataclasses import dataclass
3
+ from typing import List, Tuple
4
+
5
+
6
# Masking patterns: environment-specific or personally identifiable tokens are
# replaced with placeholders before the text is sent to the models.
UUID_RE = re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\b")  # RFC 4122 UUIDs, versions 1-5
IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")  # dotted-quad IPv4 (octets not range-checked)
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")  # common email shapes
PATH_RE = re.compile(r"(?:[A-Za-z]:)?(?:/|\\)[\w\-/\\\.]+")  # absolute POSIX/Windows filesystem paths
TIMESTAMP_RE = re.compile(r"\b\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:\.\d+)?\b")  # ISO-like timestamps; used for signature detection only, never masked
11
+
12
+
13
@dataclass
class PreprocessResult:
    """Outcome of log preprocessing: masked text plus detected metadata."""

    # Truncated log text with sensitive tokens replaced by placeholders.
    cleaned_text: str
    # Coarse feature tags found in the text (e.g. "stacktrace", "k8s", "oom").
    signatures: List[str]
    # Audit trail of masked values, each formatted "<PLACEHOLDER>:original".
    masked: List[str]
18
+
19
+
20
def detect_signatures(text: str) -> List[str]:
    """Return coarse feature tags found in *text*, in a fixed order.

    Possible tags: "stacktrace", "timestamps", "log_levels", "k8s",
    "oom", "timeout".
    """
    found: List[str] = []

    def _hit(pattern: str, flags: int = 0) -> bool:
        # Small helper so each signature check reads as one line.
        return re.search(pattern, text, flags) is not None

    if _hit(r"Traceback|Exception|Error:|Caused by:", re.IGNORECASE):
        found.append("stacktrace")
    if TIMESTAMP_RE.search(text) is not None:
        found.append("timestamps")
    if _hit(r"\bINFO\b|\bWARN\b|\bERROR\b|\bDEBUG\b|\bTRACE\b"):
        found.append("log_levels")
    if _hit(r"CrashLoopBackOff|OOMKilled|Back-off restarting", re.IGNORECASE):
        found.append("k8s")
    if _hit(r"OutOfMemoryError|Java heap space", re.IGNORECASE):
        found.append("oom")
    if _hit(r"timeout|timed out|Connection timed out", re.IGNORECASE):
        found.append("timeout")
    return found
35
+
36
+
37
def mask_sensitive(text: str) -> Tuple[str, List[str]]:
    """Replace UUIDs, IPs, emails, and filesystem paths with placeholders.

    Returns the masked text and an audit list of "placeholder:value"
    entries for every replaced match. Patterns are applied in a fixed
    order, each over the output of the previous substitution.
    """
    audit: List[str] = []
    replacements = (
        (UUID_RE, "<UUID>"),
        (IP_RE, "<IP>"),
        (EMAIL_RE, "<EMAIL>"),
        (PATH_RE, "<PATH>"),
    )
    for pattern, placeholder in replacements:
        hits = pattern.findall(text)
        if hits:
            audit.extend(f"{placeholder}:{hit}" for hit in hits)
        text = pattern.sub(placeholder, text)
    return text, audit
51
+
52
+
53
def truncate_logs(text: str, head_lines: int = 120, tail_lines: int = 80, max_lines: int = 400) -> str:
    """Shorten *text* to its first *head_lines* and last *tail_lines* lines.

    Returns *text* unchanged when it has at most *max_lines* lines, or when
    keeping head + tail would not actually shrink it. The second guard also
    prevents the head and tail slices from overlapping and duplicating lines
    when *max_lines* is configured smaller than ``head_lines + tail_lines``.
    The removed middle section is replaced by a literal "..." line.
    """
    lines = text.splitlines()
    # No-op for short inputs and for degenerate head/tail settings.
    if len(lines) <= max_lines or len(lines) <= head_lines + tail_lines:
        return text
    head = "\n".join(lines[:head_lines])
    tail = "\n".join(lines[-tail_lines:])
    return head + "\n...\n" + tail
60
+
61
+
62
def preprocess_logs(raw_text: str) -> PreprocessResult:
    """Normalize, truncate, mask, and fingerprint raw log text.

    Signatures are detected on the masked text so placeholder substitution
    happens first.
    """
    trimmed = truncate_logs(raw_text.strip())
    cleaned, audit = mask_sensitive(trimmed)
    return PreprocessResult(
        cleaned_text=cleaned,
        signatures=detect_signatures(cleaned),
        masked=list(audit),
    )
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==4.44.0
2
+ transformers==4.38.2
3
+ torch>=2.2.0,<3.0
4
+ sentence-transformers==2.5.1
5
+ numpy>=1.26.0,<2.0.0
retrieval.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional
4
+
5
+ import numpy as np
6
+ import torch
7
+ from sentence_transformers import CrossEncoder, SentenceTransformer, util
8
+
9
+
10
@dataclass
class RunbookDoc:
    """One markdown runbook loaded from the knowledge-base directory."""

    # Filesystem path the document was read from.
    path: str
    # First heading line of the file (filename used as a fallback).
    title: str
    # Full markdown body (also the text that gets embedded).
    content: str
15
+
16
+
17
class RunbookRetriever:
    """Embedding-based search over local markdown runbooks in *kb_dir*.

    Documents are embedded once at startup with a SentenceTransformer;
    queries are matched by cosine similarity, with an optional CrossEncoder
    reranking pass over a widened candidate pool.
    """

    def __init__(
        self,
        kb_dir: str = "kb",
        embed_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        reranker_name: Optional[str] = "cross-encoder/ms-marco-MiniLM-L-6-v2",
    ):
        self.kb_dir = kb_dir
        # Force CPU to avoid CUDA capability mismatches in WSL/GPUs.
        self.device = torch.device("cpu")
        self.embed_model = SentenceTransformer(embed_model_name, device=self.device)
        self.reranker: Optional[CrossEncoder] = None
        if reranker_name:
            try:
                # Reranker is best-effort; fall back to pure embedding scores
                # when the model cannot be downloaded/loaded.
                self.reranker = CrossEncoder(reranker_name, device=self.device)
            except Exception:
                self.reranker = None
        self.docs = self._load_docs()
        if self.docs:
            self.doc_embeddings = self.embed_model.encode(
                [doc.content for doc in self.docs],
                convert_to_tensor=True,
                device=self.device,
            )
        else:
            # No KB present: search() short-circuits to an empty result.
            self.doc_embeddings = None

    def _load_docs(self) -> List[RunbookDoc]:
        """Read all ``*.md`` files under kb_dir into RunbookDoc records.

        Files are loaded in sorted name order so document indices (and
        score tie-breaking in search) are deterministic across platforms;
        ``os.listdir`` alone returns an arbitrary order.
        """
        docs: List[RunbookDoc] = []
        if not os.path.isdir(self.kb_dir):
            return docs
        for fname in sorted(os.listdir(self.kb_dir)):
            if not fname.endswith(".md"):
                continue
            path = os.path.join(self.kb_dir, fname)
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
            # Title = first non-empty line minus markdown heading markers;
            # previously a leading blank line produced an empty title.
            title = fname
            for line in content.splitlines():
                stripped = line.strip()
                if stripped:
                    title = stripped.lstrip("# ").strip()
                    break
            docs.append(RunbookDoc(path=path, title=title, content=content))
        return docs

    def search(self, query: str, top_k: int = 3):
        """Return up to *top_k* runbooks as dicts with title/score/path/excerpt."""
        if not self.docs or self.doc_embeddings is None:
            return []
        query_emb = self.embed_model.encode(query, convert_to_tensor=True, device=self.device)
        scores = util.cos_sim(query_emb, self.doc_embeddings)[0]
        # Widen the candidate pool beyond top_k so the reranker has room to reorder.
        top_results = np.argsort(-scores.cpu().numpy())[: top_k * 4]
        candidates = [
            {"doc": self.docs[idx], "score": float(scores[idx])} for idx in top_results
        ]
        if self.reranker:
            pairs = [[query, c["doc"].content] for c in candidates]
            rerank_scores = self.reranker.predict(pairs)
            for cand, rscore in zip(candidates, rerank_scores):
                cand["rerank_score"] = float(rscore)
            candidates.sort(key=lambda x: x.get("rerank_score", x["score"]), reverse=True)
        else:
            candidates.sort(key=lambda x: x["score"], reverse=True)
        return [
            {
                "title": cand["doc"].title,
                "score": cand.get("rerank_score", cand["score"]),
                "path": cand["doc"].path,
                "excerpt": cand["doc"].content[:500],
            }
            for cand in candidates[:top_k]
        ]
samples/sample_java.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-04 08:15:10,042 ERROR c.example.auth.AuthFilter - Failed to authorize request
2
+ java.lang.NullPointerException: Cannot invoke "String.length()" because "token" is null
3
+ at com.example.auth.AuthFilter.validate(AuthFilter.java:64)
4
+ at com.example.auth.AuthFilter.doFilter(AuthFilter.java:45)
5
+ at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
6
+ Caused by: java.lang.IllegalStateException: clock skew too high
7
+ at com.example.auth.TokenVerifier.verify(TokenVerifier.java:32)
8
+
9
+ User received 403 on /api/orders
samples/sample_k8s.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-04T09:05:11Z kubelet Warning BackOff Back-off restarting failed container
2
+ 2024-11-04T09:05:11Z kubelet Normal Pulled Successfully pulled image "myapp:v2"
3
+ 2024-11-04T09:05:12Z kubelet Warning BackOff Back-off restarting failed container
4
+
5
+ kubectl describe pod myapp-6c8f6f4b4f-nlg8v:
6
+ Last State: Terminated
7
+ Reason: Error
8
+ Exit Code: 137
9
+ Started: Mon, 04 Nov 2024 09:05:05 +0000
10
+ Finished: Mon, 04 Nov 2024 09:05:10 +0000
11
+ Events:
12
+ Warning BackOff 2m (x6 over 3m) kubelet Back-off restarting failed container
13
+ Memory:
14
+ Limits: 128Mi
15
+ Usage: 190Mi
samples/sample_python.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-04 10:22:18,012 ERROR service.worker failed processing job abc123
2
+ Traceback (most recent call last):
3
+ File "/app/service/worker.py", line 42, in handle
4
+ process(payload)
5
+ File "/app/service/handler.py", line 88, in process
6
+ resp = requests.post(url, json=data, timeout=2)
7
+ File "/usr/local/lib/python3.10/site-packages/requests/api.py", line 116, in post
8
+ return request('post', url, data=data, json=json, **kwargs)
9
+ File "/usr/local/lib/python3.10/site-packages/requests/api.py", line 60, in request
10
+ return session.request(method=method, url=url, **kwargs)
11
+ requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='api.internal', port=443): Read timed out. (read timeout=2)
12
+
13
+ Retrying with backoff...
14
+ 2024-11-04 10:22:20,013 WARN service.worker retry attempt 1 failed