SHAFI commited on
Commit Β·
cf4e4d4
1
Parent(s): a8a38df
second commit with refactored code
Browse filesThis view is limited to 50 files because it contains too many changes. Β See raw diff
- __pycache__/avro_handler.cpython-313.pyc +0 -0
- __pycache__/backend.cpython-313.pyc +0 -0
- __pycache__/gliner_model.cpython-313.pyc +0 -0
- __pycache__/inspector.cpython-313.pyc +0 -0
- __pycache__/ocr_engine.cpython-313.pyc +0 -0
- api.py +143 -0
- backend.py +173 -307
- new_spacy β classifier_manager/__init__.py +0 -0
- classifier_manager/__pycache__/__init__.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/gliner_model.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/inspector.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/presidio_model.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/regex_scanner.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/spacy_model.cpython-313.pyc +0 -0
- classifier_manager/gliner_model.py +81 -0
- inspector.py β classifier_manager/inspector.py +24 -8
- presidio_model.py β classifier_manager/presidio_model.py +0 -0
- classifier_manager/regex_scanner.py +44 -0
- Spacy_model.py β classifier_manager/spacy_model.py +0 -0
- connectors/__init__.py +0 -0
- connectors/__pycache__/__init__.cpython-313.pyc +0 -0
- connectors/__pycache__/aws_s3_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/azure_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/confluence_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/drive_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/gcp_storage_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/gmail_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/mongo_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/mysql_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/postgres_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/slack_handler.cpython-313.pyc +0 -0
- connectors/aws_s3_handler.py +32 -0
- connectors/azure_handler.py +32 -0
- connectors/confluence_handler.py +44 -0
- connectors/drive_handler.py +52 -0
- connectors/gcp_storage_handler.py +37 -0
- connectors/gmail_handler.py +77 -0
- connectors/mongo_handler.py +45 -0
- connectors/mysql_handler.py +23 -0
- connectors/postgres_handler.py +23 -0
- connectors/slack_handler.py +47 -0
- file_handlers/__init__.py +0 -0
- file_handlers/__pycache__/__init__.cpython-313.pyc +0 -0
- file_handlers/__pycache__/avro_handler.cpython-313.pyc +0 -0
- file_handlers/__pycache__/json_handler.cpython-313.pyc +0 -0
- file_handlers/__pycache__/ocr_engine.cpython-313.pyc +0 -0
- file_handlers/__pycache__/parquet_handler.cpython-313.pyc +0 -0
- file_handlers/__pycache__/pdf_handler.cpython-313.pyc +0 -0
- file_handlers/avro_handler.py +36 -0
- file_handlers/json_handler.py +39 -0
__pycache__/avro_handler.cpython-313.pyc
ADDED
|
Binary file (2 kB). View file
|
|
|
__pycache__/backend.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/backend.cpython-313.pyc and b/__pycache__/backend.cpython-313.pyc differ
|
|
|
__pycache__/gliner_model.cpython-313.pyc
ADDED
|
Binary file (2.88 kB). View file
|
|
|
__pycache__/inspector.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/inspector.cpython-313.pyc and b/__pycache__/inspector.cpython-313.pyc differ
|
|
|
__pycache__/ocr_engine.cpython-313.pyc
ADDED
|
Binary file (1.9 kB). View file
|
|
|
api.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# api.py
|
| 2 |
+
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from typing import Optional, List
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import io
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
# Import your existing backend orchestrator
|
| 10 |
+
from core.backend import RegexClassifier
|
| 11 |
+
|
| 12 |
+
app = FastAPI(title="Segmento Sense API")
|
| 13 |
+
|
| 14 |
+
# Initialize the Brain
|
| 15 |
+
backend = RegexClassifier()
|
| 16 |
+
|
| 17 |
+
# --- Pydantic Models for Requests ---
|
| 18 |
+
class DbConnection(BaseModel):
|
| 19 |
+
type: str # postgres, mysql, mongo
|
| 20 |
+
host: str
|
| 21 |
+
port: str
|
| 22 |
+
user: str
|
| 23 |
+
password: str
|
| 24 |
+
database: str
|
| 25 |
+
collection: Optional[str] = None
|
| 26 |
+
|
| 27 |
+
class CloudConnection(BaseModel):
|
| 28 |
+
service: str # aws, azure, gcp
|
| 29 |
+
key_1: str # access_key or conn_string
|
| 30 |
+
key_2: Optional[str] = None # secret_key
|
| 31 |
+
region: Optional[str] = None
|
| 32 |
+
bucket: str
|
| 33 |
+
file_name: str
|
| 34 |
+
|
| 35 |
+
class AppConnection(BaseModel):
|
| 36 |
+
service: str # gmail, slack, confluence
|
| 37 |
+
token_or_path: str # token or credentials.json content
|
| 38 |
+
target: str # channel_id, page_id, or num_emails
|
| 39 |
+
|
| 40 |
+
# --- ENDPOINTS ---
|
| 41 |
+
|
| 42 |
+
@app.get("/")
|
| 43 |
+
def health_check():
|
| 44 |
+
return {"status": "Segmento Sense is running"}
|
| 45 |
+
|
| 46 |
+
@app.post("/scan/file")
|
| 47 |
+
async def scan_file(file: UploadFile = File(...)):
|
| 48 |
+
"""
|
| 49 |
+
Handles PDF, CSV, JSON, Parquet, Avro, Image uploads.
|
| 50 |
+
"""
|
| 51 |
+
file_bytes = await file.read()
|
| 52 |
+
filename = file.filename.lower()
|
| 53 |
+
|
| 54 |
+
df = pd.DataFrame()
|
| 55 |
+
raw_text = ""
|
| 56 |
+
|
| 57 |
+
# 1. Route to correct handler in backend.py
|
| 58 |
+
if filename.endswith(".pdf"):
|
| 59 |
+
# For demo, scan page 0
|
| 60 |
+
raw_text = backend.get_pdf_page_text(file_bytes, 0)
|
| 61 |
+
# Scan text
|
| 62 |
+
inspection = backend.run_full_inspection(raw_text)
|
| 63 |
+
matches = backend.analyze_text_hybrid(raw_text)
|
| 64 |
+
return {
|
| 65 |
+
"type": "unstructured",
|
| 66 |
+
"content": raw_text,
|
| 67 |
+
"matches": matches,
|
| 68 |
+
"stats": inspection.to_dict(orient="records")
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
elif filename.endswith((".png", ".jpg", ".jpeg")):
|
| 72 |
+
raw_text = backend.get_ocr_text_from_image(file_bytes)
|
| 73 |
+
inspection = backend.run_full_inspection(raw_text)
|
| 74 |
+
matches = backend.analyze_text_hybrid(raw_text)
|
| 75 |
+
return {
|
| 76 |
+
"type": "unstructured",
|
| 77 |
+
"content": raw_text,
|
| 78 |
+
"matches": matches,
|
| 79 |
+
"stats": inspection.to_dict(orient="records")
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
else:
|
| 83 |
+
# Structured Data
|
| 84 |
+
if filename.endswith(".csv"):
|
| 85 |
+
df = pd.read_csv(io.BytesIO(file_bytes))
|
| 86 |
+
elif filename.endswith(".json"):
|
| 87 |
+
df = backend.get_json_data(io.BytesIO(file_bytes))
|
| 88 |
+
elif filename.endswith(".parquet"):
|
| 89 |
+
df = backend.get_parquet_data(file_bytes)
|
| 90 |
+
elif filename.endswith(".avro"):
|
| 91 |
+
df = backend.get_avro_data(file_bytes)
|
| 92 |
+
|
| 93 |
+
# Get PII Counts
|
| 94 |
+
pii_counts = backend.get_pii_counts_dataframe(df)
|
| 95 |
+
masked_preview = backend.mask_dataframe(df.head(20))
|
| 96 |
+
|
| 97 |
+
return {
|
| 98 |
+
"type": "structured",
|
| 99 |
+
"pii_counts": pii_counts.to_dict(orient="records"),
|
| 100 |
+
"preview": masked_preview.to_dict(orient="records"),
|
| 101 |
+
"schema": backend.get_data_schema(df).to_dict(orient="records")
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
@app.post("/scan/database")
|
| 105 |
+
async def scan_db(conn: DbConnection):
|
| 106 |
+
df = pd.DataFrame()
|
| 107 |
+
if conn.type == "postgres":
|
| 108 |
+
df = backend.get_postgres_data(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection)
|
| 109 |
+
elif conn.type == "mysql":
|
| 110 |
+
df = backend.get_mysql_data(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection)
|
| 111 |
+
elif conn.type == "mongo":
|
| 112 |
+
df = backend.get_mongodb_data(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection)
|
| 113 |
+
|
| 114 |
+
if df.empty:
|
| 115 |
+
raise HTTPException(status_code=404, detail="Connection failed or no data found")
|
| 116 |
+
|
| 117 |
+
pii_counts = backend.get_pii_counts_dataframe(df)
|
| 118 |
+
return {
|
| 119 |
+
"source": conn.type,
|
| 120 |
+
"pii_counts": pii_counts.to_dict(orient="records"),
|
| 121 |
+
"preview": backend.mask_dataframe(df.head(10)).to_dict(orient="records")
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
@app.post("/scan/app")
|
| 125 |
+
async def scan_app(conn: AppConnection):
|
| 126 |
+
df = pd.DataFrame()
|
| 127 |
+
|
| 128 |
+
if conn.service == "slack":
|
| 129 |
+
df = backend.get_slack_messages(conn.token_or_path, conn.target)
|
| 130 |
+
elif conn.service == "confluence":
|
| 131 |
+
# Split target "url|user|page_id" if needed or adjust model
|
| 132 |
+
# Simplified for demo: assuming backend handles auth
|
| 133 |
+
pass
|
| 134 |
+
|
| 135 |
+
if df.empty:
|
| 136 |
+
raise HTTPException(status_code=400, detail="No data fetched")
|
| 137 |
+
|
| 138 |
+
pii_counts = backend.get_pii_counts_dataframe(df)
|
| 139 |
+
return {
|
| 140 |
+
"source": conn.service,
|
| 141 |
+
"pii_counts": pii_counts.to_dict(orient="records"),
|
| 142 |
+
"preview": backend.mask_dataframe(df.head(10)).to_dict(orient="records")
|
| 143 |
+
}
|
backend.py
CHANGED
|
@@ -1,68 +1,66 @@
|
|
| 1 |
-
# backend.py
|
| 2 |
import re
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
| 5 |
import fitz # PyMuPDF
|
| 6 |
import nltk
|
| 7 |
import io
|
|
|
|
|
|
|
|
|
|
| 8 |
from typing import Dict, List, Any
|
| 9 |
from sqlalchemy import create_engine
|
| 10 |
from urllib.parse import quote_plus
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
from
|
| 15 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# --- DEPENDENCY CHECKS ---
|
| 18 |
try:
|
| 19 |
from googleapiclient.discovery import build
|
| 20 |
-
from googleapiclient.http import MediaIoBaseDownload
|
| 21 |
-
from google.oauth2 import service_account
|
| 22 |
GOOGLE_AVAILABLE = True
|
| 23 |
except ImportError:
|
| 24 |
GOOGLE_AVAILABLE = False
|
| 25 |
-
print("Google
|
| 26 |
-
|
| 27 |
try:
|
| 28 |
import pymongo
|
| 29 |
MONGO_AVAILABLE = True
|
| 30 |
-
except
|
| 31 |
-
MONGO_AVAILABLE = False
|
| 32 |
-
print("PyMongo not installed.")
|
| 33 |
-
|
| 34 |
-
try:
|
| 35 |
-
import pyarrow
|
| 36 |
-
PARQUET_AVAILABLE = True
|
| 37 |
-
except ImportError:
|
| 38 |
-
PARQUET_AVAILABLE = False
|
| 39 |
-
print("PyArrow not installed.")
|
| 40 |
-
|
| 41 |
try:
|
| 42 |
import boto3
|
| 43 |
AWS_AVAILABLE = True
|
| 44 |
-
except
|
| 45 |
-
AWS_AVAILABLE = False
|
| 46 |
-
print("Boto3 not installed.")
|
| 47 |
-
|
| 48 |
try:
|
| 49 |
from azure.storage.blob import BlobServiceClient
|
| 50 |
AZURE_AVAILABLE = True
|
| 51 |
-
except
|
| 52 |
-
AZURE_AVAILABLE = False
|
| 53 |
-
print("Azure Storage Blob not installed.")
|
| 54 |
-
|
| 55 |
-
# --- GCP STORAGE IMPORT (NEW) ---
|
| 56 |
try:
|
| 57 |
from google.cloud import storage
|
| 58 |
-
# We reuse google.oauth2.service_account if available, else import it
|
| 59 |
-
from google.oauth2 import service_account as gcp_service_account
|
| 60 |
GCS_AVAILABLE = True
|
| 61 |
-
except
|
| 62 |
-
GCS_AVAILABLE = False
|
| 63 |
-
print("Google Cloud Storage library not installed.")
|
| 64 |
|
| 65 |
-
#
|
| 66 |
try:
|
| 67 |
nltk.data.find('tokenizers/punkt')
|
| 68 |
except LookupError:
|
|
@@ -75,10 +73,9 @@ except LookupError:
|
|
| 75 |
class RegexClassifier:
|
| 76 |
def __init__(self):
|
| 77 |
self.colors = {
|
| 78 |
-
"EMAIL":
|
| 79 |
-
"PHONE":
|
| 80 |
-
"LOCATION":
|
| 81 |
-
"DEFAULT": (224, 224, 224)
|
| 82 |
}
|
| 83 |
|
| 84 |
self.patterns: Dict[str, str] = {
|
|
@@ -90,69 +87,121 @@ class RegexClassifier:
|
|
| 90 |
"PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b",
|
| 91 |
}
|
| 92 |
|
|
|
|
| 93 |
self.spacy_analyzer = PiiSpacyAnalyzer()
|
| 94 |
self.presidio_analyzer = PiiPresidioAnalyzer()
|
|
|
|
| 95 |
self.inspector = ModelInspector()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
def list_patterns(self): return self.patterns
|
| 98 |
def add_pattern(self, n, r): self.patterns[n.upper()] = r
|
| 99 |
def remove_pattern(self, n): self.patterns.pop(n.upper(), None)
|
| 100 |
|
| 101 |
-
# ---
|
| 102 |
def scan_with_regex(self, text: str) -> List[dict]:
|
| 103 |
matches = []
|
| 104 |
for label, regex in self.patterns.items():
|
| 105 |
-
for
|
| 106 |
-
matches.append({"label": label, "text":
|
| 107 |
return matches
|
| 108 |
|
| 109 |
def scan_with_nltk(self, text: str) -> List[dict]:
|
| 110 |
detections = []
|
| 111 |
try:
|
| 112 |
-
|
| 113 |
-
chunked = nltk.ne_chunk(nltk.pos_tag(tokens), binary=False)
|
| 114 |
-
current_pos = 0
|
| 115 |
-
for chunk in chunked:
|
| 116 |
if hasattr(chunk, 'label') and chunk.label() in ['PERSON', 'GPE']:
|
| 117 |
val = " ".join(c[0] for c in chunk)
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
| 123 |
except: pass
|
| 124 |
return detections
|
| 125 |
|
| 126 |
def analyze_text_hybrid(self, text: str) -> List[dict]:
|
|
|
|
| 127 |
all_matches = []
|
| 128 |
all_matches.extend(self.scan_with_regex(text))
|
| 129 |
all_matches.extend(self.scan_with_nltk(text))
|
| 130 |
all_matches.extend(self.spacy_analyzer.scan(text))
|
| 131 |
all_matches.extend(self.presidio_analyzer.scan(text))
|
|
|
|
| 132 |
|
| 133 |
all_matches.sort(key=lambda x: x['start'])
|
| 134 |
-
|
| 135 |
-
unique_matches = []
|
| 136 |
if not all_matches: return []
|
| 137 |
curr = all_matches[0]
|
| 138 |
-
for
|
| 139 |
-
if
|
| 140 |
-
if len(
|
| 141 |
-
curr =
|
| 142 |
else:
|
| 143 |
-
|
| 144 |
-
curr =
|
| 145 |
-
|
| 146 |
-
return
|
| 147 |
-
|
| 148 |
-
def run_full_inspection(self, text: str)
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
def get_pii_counts(self, text: str) -> pd.DataFrame:
|
| 157 |
matches = self.analyze_text_hybrid(str(text))
|
| 158 |
if not matches: return pd.DataFrame(columns=["PII Type", "Count"])
|
|
@@ -160,261 +209,78 @@ class RegexClassifier:
|
|
| 160 |
for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1
|
| 161 |
return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"])
|
| 162 |
|
| 163 |
-
def get_pii_counts_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 164 |
-
full_text = " ".join(df.astype(str).values.flatten())
|
| 165 |
-
return self.get_pii_counts(full_text)
|
| 166 |
-
|
| 167 |
-
def mask_pii(self, text: str) -> str:
|
| 168 |
-
text = str(text)
|
| 169 |
-
matches = self.analyze_text_hybrid(text)
|
| 170 |
-
matches.sort(key=lambda x: x['start'], reverse=True)
|
| 171 |
-
for m in matches:
|
| 172 |
-
masked_val = "******"
|
| 173 |
-
if "<span" not in text[m['start']:m['end']]:
|
| 174 |
-
text = text[:m['start']] + masked_val + text[m['end']:]
|
| 175 |
-
return text
|
| 176 |
-
|
| 177 |
def mask_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 178 |
-
def
|
| 179 |
-
|
| 180 |
-
if pd.isna(val): return val
|
| 181 |
-
return self.mask_pii(str(val))
|
| 182 |
-
return df.map(safe_mask)
|
| 183 |
-
|
| 184 |
-
def get_labeled_pdf_image(self, file_bytes, page_num: int):
|
| 185 |
-
try:
|
| 186 |
-
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 187 |
-
if not (0 <= page_num < len(doc)): return None
|
| 188 |
-
page = doc[page_num]
|
| 189 |
-
text = page.get_text("text")
|
| 190 |
matches = self.analyze_text_hybrid(text)
|
|
|
|
| 191 |
for m in matches:
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
page.insert_text(fitz.Point(quad.x0, quad.y0-2), m['label'], fontsize=6, color=(0,0,0))
|
| 197 |
-
return page.get_pixmap(matrix=fitz.Matrix(2, 2)).tobytes("png")
|
| 198 |
-
except: return None
|
| 199 |
|
| 200 |
def scan_dataframe_with_html(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 201 |
-
def
|
| 202 |
text = str(text)
|
| 203 |
matches = self.analyze_text_hybrid(text)
|
| 204 |
matches.sort(key=lambda x: x['start'], reverse=True)
|
| 205 |
-
hex_map = {"EMAIL": "#8ef", "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea", "FIRST_NAME": "#af9", "LAST_NAME": "#af9", "LOCATION": "#dcf", "AADHAAR_IND": "#f9f", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"}
|
| 206 |
for m in matches:
|
| 207 |
if "<span" in text[m['start']:m['end']]: continue
|
| 208 |
-
color =
|
| 209 |
-
|
| 210 |
-
text = text[:m['start']] +
|
| 211 |
return text
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
def get_data_schema(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 219 |
-
if df.empty: return pd.DataFrame(columns=["Column", "Type", "Sample"])
|
| 220 |
-
schema_info = []
|
| 221 |
-
for col in df.columns:
|
| 222 |
-
d_type = str(df[col].dtype)
|
| 223 |
-
first_valid_idx = df[col].first_valid_index()
|
| 224 |
-
sample_val = str(df[col].loc[first_valid_idx]) if first_valid_idx is not None else "All Null"
|
| 225 |
-
if len(sample_val) > 50: sample_val = sample_val[:47] + "..."
|
| 226 |
-
schema_info.append({"Column Name": col, "Data Type": d_type, "Sample Value": sample_val})
|
| 227 |
-
return pd.DataFrame(schema_info)
|
| 228 |
-
|
| 229 |
-
# --- SQL/MONGO/DRIVE/S3/AZURE CONNECTORS ---
|
| 230 |
def get_postgres_data(self, host, port, db, user, pw, table):
|
| 231 |
-
|
| 232 |
-
conn_str = f"postgresql://{user}:{safe_pw}@{host}:{port}/{db}"
|
| 233 |
-
engine = create_engine(conn_str)
|
| 234 |
-
return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine)
|
| 235 |
|
| 236 |
def get_mysql_data(self, host, port, db, user, pw, table):
|
| 237 |
-
|
| 238 |
-
conn_str = f"mysql+pymysql://{user}:{safe_pw}@{host}:{port}/{db}"
|
| 239 |
-
engine = create_engine(conn_str)
|
| 240 |
-
return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine)
|
| 241 |
|
| 242 |
-
def
|
| 243 |
-
|
| 244 |
-
try:
|
| 245 |
-
if user and pw:
|
| 246 |
-
safe_user = quote_plus(user)
|
| 247 |
-
safe_pw = quote_plus(pw)
|
| 248 |
-
uri = f"mongodb://{safe_user}:{safe_pw}@{host}:{port}/"
|
| 249 |
-
else:
|
| 250 |
-
uri = f"mongodb://{host}:{port}/"
|
| 251 |
-
client = pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000)
|
| 252 |
-
database = client[db]
|
| 253 |
-
col = database[collection]
|
| 254 |
-
cursor = col.find().limit(100)
|
| 255 |
-
data_list = list(cursor)
|
| 256 |
-
if not data_list: return pd.DataFrame()
|
| 257 |
-
for doc in data_list:
|
| 258 |
-
if '_id' in doc: doc['_id'] = str(doc['_id'])
|
| 259 |
-
return pd.json_normalize(data_list)
|
| 260 |
-
except Exception as e:
|
| 261 |
-
print(f"Mongo Error: {e}")
|
| 262 |
-
raise e
|
| 263 |
|
| 264 |
def get_google_drive_files(self, credentials_dict):
|
| 265 |
-
|
| 266 |
-
try:
|
| 267 |
-
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
|
| 268 |
-
creds = service_account.Credentials.from_service_account_info(credentials_dict, scopes=SCOPES)
|
| 269 |
-
service = build('drive', 'v3', credentials=creds)
|
| 270 |
-
return service.files().list(pageSize=15, fields="files(id, name, mimeType)").execute().get('files', [])
|
| 271 |
-
except Exception as e:
|
| 272 |
-
return []
|
| 273 |
|
| 274 |
def download_drive_file(self, file_id, mime_type, credentials_dict):
|
| 275 |
-
|
| 276 |
-
try:
|
| 277 |
-
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
|
| 278 |
-
creds = service_account.Credentials.from_service_account_info(credentials_dict, scopes=SCOPES)
|
| 279 |
-
service = build('drive', 'v3', credentials=creds)
|
| 280 |
-
if "spreadsheet" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='text/csv')
|
| 281 |
-
elif "document" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
|
| 282 |
-
elif "presentation" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
|
| 283 |
-
else: request = service.files().get_media(fileId=file_id)
|
| 284 |
-
fh = io.BytesIO()
|
| 285 |
-
downloader = MediaIoBaseDownload(fh, request)
|
| 286 |
-
done = False
|
| 287 |
-
while done is False: status, done = downloader.next_chunk()
|
| 288 |
-
return fh.getvalue()
|
| 289 |
-
except: return b""
|
| 290 |
-
|
| 291 |
-
def get_s3_buckets(self, access_key, secret_key, region):
|
| 292 |
-
if not AWS_AVAILABLE: return []
|
| 293 |
-
try:
|
| 294 |
-
s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region)
|
| 295 |
-
response = s3.list_buckets()
|
| 296 |
-
return [b['Name'] for b in response.get('Buckets', [])]
|
| 297 |
-
except Exception as e:
|
| 298 |
-
print(f"S3 Error: {e}")
|
| 299 |
-
return []
|
| 300 |
-
|
| 301 |
-
def get_s3_files(self, access_key, secret_key, region, bucket_name):
|
| 302 |
-
if not AWS_AVAILABLE: return []
|
| 303 |
-
try:
|
| 304 |
-
s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region)
|
| 305 |
-
response = s3.list_objects_v2(Bucket=bucket_name)
|
| 306 |
-
return [obj['Key'] for obj in response.get('Contents', [])]
|
| 307 |
-
except Exception as e:
|
| 308 |
-
return []
|
| 309 |
-
|
| 310 |
-
def download_s3_file(self, access_key, secret_key, region, bucket_name, file_key):
|
| 311 |
-
if not AWS_AVAILABLE: return b""
|
| 312 |
-
try:
|
| 313 |
-
s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region)
|
| 314 |
-
obj = s3.get_object(Bucket=bucket_name, Key=file_key)
|
| 315 |
-
return obj['Body'].read()
|
| 316 |
-
except Exception as e:
|
| 317 |
-
return b""
|
| 318 |
-
|
| 319 |
-
def get_azure_containers(self, conn_str):
|
| 320 |
-
if not AZURE_AVAILABLE: return []
|
| 321 |
-
try:
|
| 322 |
-
blob_service_client = BlobServiceClient.from_connection_string(conn_str)
|
| 323 |
-
containers = blob_service_client.list_containers()
|
| 324 |
-
return [c['name'] for c in containers]
|
| 325 |
-
except Exception as e:
|
| 326 |
-
print(f"Azure Error: {e}")
|
| 327 |
-
return []
|
| 328 |
-
|
| 329 |
-
def get_azure_blobs(self, conn_str, container_name):
|
| 330 |
-
if not AZURE_AVAILABLE: return []
|
| 331 |
-
try:
|
| 332 |
-
blob_service_client = BlobServiceClient.from_connection_string(conn_str)
|
| 333 |
-
container_client = blob_service_client.get_container_client(container_name)
|
| 334 |
-
blobs = container_client.list_blobs()
|
| 335 |
-
return [b['name'] for b in blobs]
|
| 336 |
-
except Exception as e:
|
| 337 |
-
return []
|
| 338 |
-
|
| 339 |
-
def download_azure_blob(self, conn_str, container_name, blob_name):
|
| 340 |
-
if not AZURE_AVAILABLE: return b""
|
| 341 |
-
try:
|
| 342 |
-
blob_service_client = BlobServiceClient.from_connection_string(conn_str)
|
| 343 |
-
blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
|
| 344 |
-
return blob_client.download_blob().readall()
|
| 345 |
-
except Exception as e:
|
| 346 |
-
return b""
|
| 347 |
-
|
| 348 |
-
# --- GCP BUCKET CONNECTORS (NEW) ---
|
| 349 |
-
def get_gcs_buckets(self, credentials_dict):
|
| 350 |
-
"""Lists all GCS buckets for the given service account credentials."""
|
| 351 |
-
if not GCS_AVAILABLE: return []
|
| 352 |
-
try:
|
| 353 |
-
# Create credentials object
|
| 354 |
-
credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict)
|
| 355 |
-
# Create storage client
|
| 356 |
-
storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))
|
| 357 |
-
|
| 358 |
-
buckets = storage_client.list_buckets()
|
| 359 |
-
return [bucket.name for bucket in buckets]
|
| 360 |
-
except Exception as e:
|
| 361 |
-
print(f"GCP Bucket Error: {e}")
|
| 362 |
-
return []
|
| 363 |
-
|
| 364 |
-
def get_gcs_files(self, credentials_dict, bucket_name):
|
| 365 |
-
"""Lists files (blobs) in a specific GCS bucket."""
|
| 366 |
-
if not GCS_AVAILABLE: return []
|
| 367 |
-
try:
|
| 368 |
-
credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict)
|
| 369 |
-
storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))
|
| 370 |
-
|
| 371 |
-
blobs = storage_client.list_blobs(bucket_name)
|
| 372 |
-
return [blob.name for blob in blobs]
|
| 373 |
-
except Exception as e:
|
| 374 |
-
print(f"GCP List Error: {e}")
|
| 375 |
-
return []
|
| 376 |
-
|
| 377 |
-
def download_gcs_file(self, credentials_dict, bucket_name, blob_name):
|
| 378 |
-
"""Downloads a blob from GCS to memory."""
|
| 379 |
-
if not GCS_AVAILABLE: return b""
|
| 380 |
-
try:
|
| 381 |
-
credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict)
|
| 382 |
-
storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))
|
| 383 |
-
|
| 384 |
-
bucket = storage_client.bucket(bucket_name)
|
| 385 |
-
blob = bucket.blob(blob_name)
|
| 386 |
-
return blob.download_as_bytes()
|
| 387 |
-
except Exception as e:
|
| 388 |
-
print(f"GCP Download Error: {e}")
|
| 389 |
-
return b""
|
| 390 |
-
|
| 391 |
-
# --- FILE READERS ---
|
| 392 |
-
def get_json_data(self, file_obj) -> pd.DataFrame:
|
| 393 |
-
data = json.load(file_obj)
|
| 394 |
-
flat = []
|
| 395 |
-
def recursive(d, path):
|
| 396 |
-
if isinstance(d, dict):
|
| 397 |
-
for k, v in d.items(): recursive(v, f"{path}.{k}" if path else k)
|
| 398 |
-
elif isinstance(d, list):
|
| 399 |
-
for i, v in enumerate(d): recursive(v, f"{path}[{i}]")
|
| 400 |
-
else: flat.append({"Path": path, "Value": str(d)})
|
| 401 |
-
recursive(data, "")
|
| 402 |
-
return pd.DataFrame(flat)
|
| 403 |
|
| 404 |
-
def
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
return pd.read_parquet(io.BytesIO(file_bytes))
|
| 408 |
-
except: return pd.DataFrame()
|
| 409 |
-
|
| 410 |
-
def get_pdf_total_pages(self, file_bytes) -> int:
|
| 411 |
-
try:
|
| 412 |
-
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 413 |
-
return len(doc)
|
| 414 |
-
except: return 0
|
| 415 |
|
| 416 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
try:
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
import json
|
| 3 |
import pandas as pd
|
| 4 |
import fitz # PyMuPDF
|
| 5 |
import nltk
|
| 6 |
import io
|
| 7 |
+
import os
|
| 8 |
+
import pickle
|
| 9 |
+
import base64
|
| 10 |
from typing import Dict, List, Any
|
| 11 |
from sqlalchemy import create_engine
|
| 12 |
from urllib.parse import quote_plus
|
| 13 |
+
from bs4 import BeautifulSoup
|
| 14 |
+
|
| 15 |
+
# --- IMPORT CLASSIFIERS ---
|
| 16 |
+
from classifier_manager.spacy_model import PiiSpacyAnalyzer
|
| 17 |
+
from classifier_manager.presidio_model import PiiPresidioAnalyzer
|
| 18 |
+
from classifier_manager.gliner_model import PiiGlinerAnalyzer
|
| 19 |
+
from classifier_manager.inspector import ModelInspector
|
| 20 |
+
|
| 21 |
+
# --- IMPORT FILE HANDLERS ---
|
| 22 |
+
from file_handlers.ocr_engine import OcrEngine
|
| 23 |
+
from file_handlers.avro_handler import AvroHandler
|
| 24 |
+
from file_handlers.parquet_handler import ParquetHandler
|
| 25 |
+
from file_handlers.json_handler import JsonHandler
|
| 26 |
+
from file_handlers.pdf_handler import PdfHandler
|
| 27 |
+
|
| 28 |
+
# --- IMPORT CONNECTORS ---
|
| 29 |
+
from connectors.postgres_handler import PostgresHandler
|
| 30 |
+
from connectors.mysql_handler import MysqlHandler
|
| 31 |
+
from connectors.gmail_handler import GmailHandler
|
| 32 |
+
from connectors.drive_handler import DriveHandler
|
| 33 |
+
from connectors.aws_s3_handler import S3Handler
|
| 34 |
+
from connectors.azure_handler import AzureBlobHandler
|
| 35 |
+
from connectors.gcp_storage_handler import GcpStorageHandler
|
| 36 |
+
from connectors.slack_handler import SlackHandler # <--- NEW
|
| 37 |
+
from connectors.confluence_handler import ConfluenceHandler # <--- NEW
|
| 38 |
|
| 39 |
# --- DEPENDENCY CHECKS ---
|
| 40 |
try:
|
| 41 |
from googleapiclient.discovery import build
|
|
|
|
|
|
|
| 42 |
GOOGLE_AVAILABLE = True
|
| 43 |
except ImportError:
|
| 44 |
GOOGLE_AVAILABLE = False
|
| 45 |
+
print("Google Libraries not installed.")
|
|
|
|
| 46 |
try:
|
| 47 |
import pymongo
|
| 48 |
MONGO_AVAILABLE = True
|
| 49 |
+
except: MONGO_AVAILABLE = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
try:
|
| 51 |
import boto3
|
| 52 |
AWS_AVAILABLE = True
|
| 53 |
+
except: AWS_AVAILABLE = False
|
|
|
|
|
|
|
|
|
|
| 54 |
try:
|
| 55 |
from azure.storage.blob import BlobServiceClient
|
| 56 |
AZURE_AVAILABLE = True
|
| 57 |
+
except: AZURE_AVAILABLE = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
try:
|
| 59 |
from google.cloud import storage
|
|
|
|
|
|
|
| 60 |
GCS_AVAILABLE = True
|
| 61 |
+
except: GCS_AVAILABLE = False
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
# NLTK Setup
|
| 64 |
try:
|
| 65 |
nltk.data.find('tokenizers/punkt')
|
| 66 |
except LookupError:
|
|
|
|
| 73 |
class RegexClassifier:
|
| 74 |
def __init__(self):
|
| 75 |
self.colors = {
|
| 76 |
+
"EMAIL": "#8ef", "FIRST_NAME": "#af9", "LAST_NAME": "#af9",
|
| 77 |
+
"PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea",
|
| 78 |
+
"LOCATION": "#dcf", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"
|
|
|
|
| 79 |
}
|
| 80 |
|
| 81 |
self.patterns: Dict[str, str] = {
|
|
|
|
| 87 |
"PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b",
|
| 88 |
}
|
| 89 |
|
| 90 |
+
# 1. Classifiers
|
| 91 |
self.spacy_analyzer = PiiSpacyAnalyzer()
|
| 92 |
self.presidio_analyzer = PiiPresidioAnalyzer()
|
| 93 |
+
self.gliner_analyzer = PiiGlinerAnalyzer()
|
| 94 |
self.inspector = ModelInspector()
|
| 95 |
+
|
| 96 |
+
# 2. File Handlers
|
| 97 |
+
self.ocr_engine = OcrEngine()
|
| 98 |
+
self.avro_handler = AvroHandler()
|
| 99 |
+
self.parquet_handler = ParquetHandler()
|
| 100 |
+
self.json_handler = JsonHandler()
|
| 101 |
+
self.pdf_handler = PdfHandler(self.ocr_engine)
|
| 102 |
+
|
| 103 |
+
# 3. Connectors
|
| 104 |
+
self.pg_handler = PostgresHandler()
|
| 105 |
+
self.mysql_handler = MysqlHandler()
|
| 106 |
+
self.gmail_handler = GmailHandler()
|
| 107 |
+
self.drive_handler = DriveHandler()
|
| 108 |
+
self.s3_handler = S3Handler()
|
| 109 |
+
self.azure_handler = AzureBlobHandler()
|
| 110 |
+
self.gcp_handler = GcpStorageHandler()
|
| 111 |
+
self.slack_handler = SlackHandler() # <--- Init
|
| 112 |
+
self.confluence_handler = ConfluenceHandler() # <--- Init
|
| 113 |
|
| 114 |
def list_patterns(self): return self.patterns
|
| 115 |
def add_pattern(self, n, r): self.patterns[n.upper()] = r
|
| 116 |
def remove_pattern(self, n): self.patterns.pop(n.upper(), None)
|
| 117 |
|
| 118 |
+
# --- CORE ANALYSIS ---
|
| 119 |
def scan_with_regex(self, text: str) -> List[dict]:
|
| 120 |
matches = []
|
| 121 |
for label, regex in self.patterns.items():
|
| 122 |
+
for m in re.finditer(regex, text):
|
| 123 |
+
matches.append({"label": label, "text": m.group(), "start": m.start(), "end": m.end(), "source": "Regex"})
|
| 124 |
return matches
|
| 125 |
|
| 126 |
def scan_with_nltk(self, text: str) -> List[dict]:
|
| 127 |
detections = []
|
| 128 |
try:
|
| 129 |
+
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
|
|
|
|
|
|
|
|
|
|
| 130 |
if hasattr(chunk, 'label') and chunk.label() in ['PERSON', 'GPE']:
|
| 131 |
val = " ".join(c[0] for c in chunk)
|
| 132 |
+
start = text.find(val)
|
| 133 |
+
if start != -1:
|
| 134 |
+
detections.append({
|
| 135 |
+
"label": "LOCATION" if chunk.label() == 'GPE' else "FIRST_NAME",
|
| 136 |
+
"text": val, "start": start, "end": start+len(val), "source": "NLTK"
|
| 137 |
+
})
|
| 138 |
except: pass
|
| 139 |
return detections
|
| 140 |
|
| 141 |
def analyze_text_hybrid(self, text: str) -> List[dict]:
|
| 142 |
+
if not text: return []
|
| 143 |
all_matches = []
|
| 144 |
all_matches.extend(self.scan_with_regex(text))
|
| 145 |
all_matches.extend(self.scan_with_nltk(text))
|
| 146 |
all_matches.extend(self.spacy_analyzer.scan(text))
|
| 147 |
all_matches.extend(self.presidio_analyzer.scan(text))
|
| 148 |
+
all_matches.extend(self.gliner_analyzer.scan(text))
|
| 149 |
|
| 150 |
all_matches.sort(key=lambda x: x['start'])
|
| 151 |
+
unique = []
|
|
|
|
| 152 |
if not all_matches: return []
|
| 153 |
curr = all_matches[0]
|
| 154 |
+
for next_m in all_matches[1:]:
|
| 155 |
+
if next_m['start'] < curr['end']:
|
| 156 |
+
if len(next_m['text']) > len(curr['text']):
|
| 157 |
+
curr = next_m
|
| 158 |
else:
|
| 159 |
+
unique.append(curr)
|
| 160 |
+
curr = next_m
|
| 161 |
+
unique.append(curr)
|
| 162 |
+
return unique
|
| 163 |
+
|
| 164 |
+
def run_full_inspection(self, text: str):
|
| 165 |
+
return self.inspector.compare_models(
|
| 166 |
+
self.scan_with_regex(text),
|
| 167 |
+
self.scan_with_nltk(text),
|
| 168 |
+
self.spacy_analyzer.scan(text),
|
| 169 |
+
self.presidio_analyzer.scan(text),
|
| 170 |
+
self.gliner_analyzer.scan(text)
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
    # --- WRAPPERS FOR UI ---
    # Thin delegations so the UI layer only ever talks to this class.

    def get_json_data(self, file_obj) -> pd.DataFrame:
        """Parse an uploaded JSON file object into a DataFrame via JsonHandler."""
        return self.json_handler.read_file(file_obj)

    def get_pdf_page_text(self, file_bytes, page_num):
        """Extract the text of one PDF page (PdfHandler was constructed with
        the OCR engine, so it may OCR scanned pages — confirm in PdfHandler)."""
        return self.pdf_handler.get_page_text(file_bytes, page_num)

    def get_pdf_total_pages(self, file_bytes) -> int:
        """Return the number of pages in a PDF given its raw bytes."""
        return self.pdf_handler.get_total_pages(file_bytes)

    def get_labeled_pdf_image(self, file_bytes, page_num):
        """Render one PDF page as an image with detected PII highlighted
        using the color map in self.colors."""
        text = self.get_pdf_page_text(file_bytes, page_num)
        matches = self.analyze_text_hybrid(text)
        return self.pdf_handler.render_labeled_image(file_bytes, page_num, matches, self.colors)

    def get_avro_data(self, file_bytes) -> pd.DataFrame:
        """Decode Avro bytes into a DataFrame via AvroHandler."""
        return self.avro_handler.convert_to_dataframe(file_bytes)

    def get_parquet_data(self, file_bytes) -> pd.DataFrame:
        """Decode Parquet bytes into a DataFrame via ParquetHandler."""
        return self.parquet_handler.convert_to_dataframe(file_bytes)

    def get_ocr_text_from_image(self, file_bytes) -> str:
        """OCR an image and return the recognized text."""
        return self.ocr_engine.extract_text(file_bytes)
|
| 196 |
+
|
| 197 |
+
def get_pii_counts_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 198 |
+
text = " ".join(df.astype(str).values.flatten())
|
| 199 |
+
matches = self.analyze_text_hybrid(str(text))
|
| 200 |
+
if not matches: return pd.DataFrame(columns=["PII Type", "Count"])
|
| 201 |
+
counts = {}
|
| 202 |
+
for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1
|
| 203 |
+
return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"])
|
| 204 |
+
|
| 205 |
def get_pii_counts(self, text: str) -> pd.DataFrame:
|
| 206 |
matches = self.analyze_text_hybrid(str(text))
|
| 207 |
if not matches: return pd.DataFrame(columns=["PII Type", "Count"])
|
|
|
|
| 209 |
for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1
|
| 210 |
return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"])
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
def mask_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 213 |
+
def mask_text(text):
|
| 214 |
+
text = str(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
matches = self.analyze_text_hybrid(text)
|
| 216 |
+
matches.sort(key=lambda x: x['start'], reverse=True)
|
| 217 |
for m in matches:
|
| 218 |
+
if "***" not in text[m['start']:m['end']]:
|
| 219 |
+
text = text[:m['start']] + "******" + text[m['end']:]
|
| 220 |
+
return text
|
| 221 |
+
return df.map(lambda x: mask_text(x) if isinstance(x, (str, int, float)) else x)
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
    def scan_dataframe_with_html(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return *df* with each string cell rewritten as HTML, wrapping every
        detected PII span in a colored <span> (colors from self.colors)."""
        def highlight(text):
            text = str(text)
            matches = self.analyze_text_hybrid(text)
            # Insert right-to-left so earlier offsets stay valid after splicing.
            matches.sort(key=lambda x: x['start'], reverse=True)
            for m in matches:
                # Skip spans that already received markup (overlapping hits).
                if "<span" in text[m['start']:m['end']]: continue
                color = self.colors.get(m['label'], self.colors["DEFAULT"])
                # NOTE(review): cell text and m["text"] are embedded without HTML
                # escaping; if cells can contain markup this is an injection
                # risk — confirm how the UI renders this output.
                replacement = f'<span style="background:{color}; padding:2px; border-radius:4px;">{m["text"]}</span>'
                text = text[:m['start']] + replacement + text[m['end']:]
            return text
        return df.map(lambda x: highlight(x) if isinstance(x, str) else x)
|
| 235 |
+
|
| 236 |
+
def get_data_schema(self, df):
|
| 237 |
+
return pd.DataFrame({"Column": df.columns, "Type": df.dtypes.astype(str)})
|
| 238 |
+
|
| 239 |
+
    # --- CONNECTOR WRAPPERS ---

    def get_postgres_data(self, host, port, db, user, pw, table):
        """Fetch rows from a PostgreSQL table via PostgresHandler."""
        return self.pg_handler.fetch_data(host, port, db, user, pw, table)

    def get_mysql_data(self, host, port, db, user, pw, table):
        """Fetch rows from a MySQL table via MysqlHandler."""
        return self.mysql_handler.fetch_data(host, port, db, user, pw, table)

    def get_gmail_data(self, credentials_file, num_emails=10) -> pd.DataFrame:
        """Fetch the most recent emails via GmailHandler (the OAuth flow and
        token caching happen inside the handler)."""
        return self.gmail_handler.fetch_emails(credentials_file, num_emails)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
    def get_google_drive_files(self, credentials_dict):
        """List Drive files visible to the service account (via DriveHandler)."""
        return self.drive_handler.list_files(credentials_dict)

    def download_drive_file(self, file_id, mime_type, credentials_dict):
        """Download a Drive file's bytes; native Google formats are exported
        (Sheets -> CSV, Docs/Slides -> PDF) by DriveHandler."""
        return self.drive_handler.download_file(file_id, mime_type, credentials_dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
+
    # AWS S3 wrappers. Short positional params kept for existing UI call
    # sites: a=access key, s=secret key, r=region, b=bucket, k=object key.
    def get_s3_buckets(self, a, s, r): return self.s3_handler.get_buckets(a, s, r)
    def get_s3_files(self, a, s, r, b): return self.s3_handler.get_files(a, s, r, b)
    def download_s3_file(self, a, s, r, b, k): return self.s3_handler.download_file(a, s, r, b, k)

    # Azure Blob Storage wrappers: c=connection string, n=container, b=blob name.
    def get_azure_containers(self, c): return self.azure_handler.get_containers(c)
    def get_azure_blobs(self, c, n): return self.azure_handler.get_blobs(c, n)
    def download_azure_blob(self, c, n, b): return self.azure_handler.download_blob(c, n, b)

    # Google Cloud Storage wrappers: c=service-account info dict, b=bucket, n=blob name.
    def get_gcs_buckets(self, c): return self.gcp_handler.get_buckets(c)
    def get_gcs_files(self, c, b): return self.gcp_handler.get_files(c, b)
    def download_gcs_file(self, c, b, n): return self.gcp_handler.download_file(c, b, n)

    # --- NEW WRAPPERS FOR SLACK & CONFLUENCE ---
    def get_slack_messages(self, token, channel_id):
        """Fetch messages from one Slack channel via SlackHandler."""
        return self.slack_handler.fetch_messages(token, channel_id)

    def get_confluence_page(self, url, username, token, page_id):
        """Fetch a Confluence page's cleaned text via ConfluenceHandler."""
        return self.confluence_handler.fetch_page_content(url, username, token, page_id)
|
| 273 |
+
|
| 274 |
+
# --- MONGO (Still here) ---
|
| 275 |
+
def get_mongodb_data(self, host, port, db, user, pw, collection):
|
| 276 |
+
if not MONGO_AVAILABLE: return pd.DataFrame()
|
| 277 |
try:
|
| 278 |
+
if user and pw: uri = f"mongodb://{quote_plus(user)}:{quote_plus(pw)}@{host}:{port}/"
|
| 279 |
+
else: uri = f"mongodb://{host}:{port}/"
|
| 280 |
+
client = pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000)
|
| 281 |
+
cursor = client[db][collection].find().limit(100)
|
| 282 |
+
data = list(cursor)
|
| 283 |
+
if not data: return pd.DataFrame()
|
| 284 |
+
for d in data: d['_id'] = str(d.get('_id', ''))
|
| 285 |
+
return pd.json_normalize(data)
|
| 286 |
+
except: return pd.DataFrame()
|
new_spacy β classifier_manager/__init__.py
RENAMED
|
File without changes
|
classifier_manager/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (206 Bytes). View file
|
|
|
classifier_manager/__pycache__/gliner_model.cpython-313.pyc
ADDED
|
Binary file (2.9 kB). View file
|
|
|
classifier_manager/__pycache__/inspector.cpython-313.pyc
ADDED
|
Binary file (3.8 kB). View file
|
|
|
classifier_manager/__pycache__/presidio_model.cpython-313.pyc
ADDED
|
Binary file (2.97 kB). View file
|
|
|
classifier_manager/__pycache__/regex_scanner.cpython-313.pyc
ADDED
|
Binary file (2.51 kB). View file
|
|
|
classifier_manager/__pycache__/spacy_model.cpython-313.pyc
ADDED
|
Binary file (2.85 kB). View file
|
|
|
classifier_manager/gliner_model.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from gliner import GLiNER
|
| 2 |
+
|
| 3 |
+
class PiiGlinerAnalyzer:
    """PII scanner backed by GLiNER zero-shot named-entity recognition.

    Loads a small BERT-based GLiNER model and prompts it with
    natural-language labels; detections are normalized to the app's
    standard dict shape (label/text/start/end/score/source).
    """

    # Natural-language prompt labels GLiNER is asked to find.
    DEFAULT_LABELS = [
        "person",
        "email",
        "phone number",
        "credit card",
        "social security number",
        "organization",
        "location",
        "date",
        "ip address",
        "passport number",
        "driver license",
    ]

    # Map GLiNER's lowercase labels to the app's uppercase keys.
    # Hoisted to a class constant: previously rebuilt on every scan() call.
    LABEL_MAP = {
        "person": "FIRST_NAME",
        "phone number": "PHONE",
        "social security number": "SSN",
        "organization": "ORG",
        "location": "LOCATION",
        "ip address": "IP_ADDRESS",
        "credit card": "CREDIT_CARD",
        "email": "EMAIL",
        "date": "DATE_TIME",
        "passport number": "PASSPORT",
        "driver license": "DRIVER_LICENSE",
    }

    def __init__(self, model_name="urchade/gliner_small-v2.1"):
        """Initialize the GLiNER model.

        Uses a small, efficient BERT-based model by default; the model is
        downloaded to the local cache on the first run. On failure the
        analyzer stays usable but scan() always returns [].
        """
        self.model = None
        self.available = False
        # Instance copy kept so callers can customize labels per-instance.
        self.labels = list(self.DEFAULT_LABELS)
        try:
            print(f"⏳ Loading GLiNER model: {model_name}...")
            self.model = GLiNER.from_pretrained(model_name)
            self.available = True
            print("✅ GLiNER model loaded successfully.")
        except Exception as e:
            print(f"❌ Error loading GLiNER: {e}")

    def scan(self, text: str, threshold: float = 0.5) -> list:
        """Scan *text* with GLiNER and normalize the output.

        *threshold* generalizes the previously hard-coded 0.5 confidence
        cut-off; the default preserves the old behavior (0.5 is a good
        balance for the small model).
        Returns [] when the model is unavailable or the text is blank.
        """
        if not self.available or not text or not text.strip():
            return []
        try:
            entities = self.model.predict_entities(text, self.labels, threshold=threshold)
            detections = []
            for ent in entities:
                detections.append({
                    # Unknown labels fall back to UPPER_SNAKE_CASE of the prompt.
                    "label": self.LABEL_MAP.get(ent["label"], ent["label"].upper().replace(" ", "_")),
                    "text": ent["text"],
                    "start": ent["start"],
                    "end": ent["end"],
                    "score": ent["score"],
                    "source": "GLiNER",  # helpful metadata for the inspector
                })
            return detections
        except Exception as e:
            print(f"⚠️ GLiNER Scan Error: {e}")
            return []
|
inspector.py β classifier_manager/inspector.py
RENAMED
|
@@ -12,16 +12,17 @@ class ModelInspector:
|
|
| 12 |
"end": match["end"]
|
| 13 |
}
|
| 14 |
|
| 15 |
-
def compare_models(self, regex_matches, nltk_matches, spacy_matches, presidio_matches):
|
| 16 |
"""
|
| 17 |
-
Compares
|
|
|
|
| 18 |
"""
|
| 19 |
all_detections = {}
|
| 20 |
|
| 21 |
def add_to_master(matches, model_name):
|
| 22 |
found_set = set()
|
| 23 |
for m in matches:
|
| 24 |
-
# Use tuple key for uniqueness
|
| 25 |
key = (m['start'], m['end'], m['text'])
|
| 26 |
if key not in all_detections:
|
| 27 |
all_detections[key] = {'text': m['text'], 'label': m['label']}
|
|
@@ -32,19 +33,26 @@ class ModelInspector:
|
|
| 32 |
regex_set = add_to_master(regex_matches, "Regex")
|
| 33 |
nltk_set = add_to_master(nltk_matches, "NLTK")
|
| 34 |
spacy_set = add_to_master(spacy_matches, "SpaCy")
|
| 35 |
-
presidio_set = add_to_master(presidio_matches, "Presidio")
|
|
|
|
| 36 |
|
| 37 |
-
# 2. Calculate "Missed" Data
|
| 38 |
total_unique_pii = set(all_detections.keys())
|
| 39 |
|
| 40 |
regex_missed = total_unique_pii - regex_set
|
| 41 |
nltk_missed = total_unique_pii - nltk_set
|
| 42 |
spacy_missed = total_unique_pii - spacy_set
|
| 43 |
-
presidio_missed = total_unique_pii - presidio_set
|
|
|
|
| 44 |
|
| 45 |
def fmt(item_set):
|
| 46 |
items = [all_detections[k]['text'] for k in item_set]
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
total_count = len(total_unique_pii) if len(total_unique_pii) > 0 else 1
|
| 50 |
|
|
@@ -76,7 +84,15 @@ class ModelInspector:
|
|
| 76 |
"Missed PII": fmt(presidio_missed),
|
| 77 |
"Accuracy": len(presidio_set) / total_count,
|
| 78 |
"Count": len(presidio_set)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
}
|
| 80 |
]
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
| 12 |
"end": match["end"]
|
| 13 |
}
|
| 14 |
|
| 15 |
+
def compare_models(self, regex_matches, nltk_matches, spacy_matches, presidio_matches, gliner_matches):
|
| 16 |
"""
|
| 17 |
+
Compares 5 lists of matches to find Unique vs Missed PII.
|
| 18 |
+
Added GLiNER to the comparison logic.
|
| 19 |
"""
|
| 20 |
all_detections = {}
|
| 21 |
|
| 22 |
def add_to_master(matches, model_name):
|
| 23 |
found_set = set()
|
| 24 |
for m in matches:
|
| 25 |
+
# Use tuple key for uniqueness: (start, end, text)
|
| 26 |
key = (m['start'], m['end'], m['text'])
|
| 27 |
if key not in all_detections:
|
| 28 |
all_detections[key] = {'text': m['text'], 'label': m['label']}
|
|
|
|
| 33 |
regex_set = add_to_master(regex_matches, "Regex")
|
| 34 |
nltk_set = add_to_master(nltk_matches, "NLTK")
|
| 35 |
spacy_set = add_to_master(spacy_matches, "SpaCy")
|
| 36 |
+
presidio_set = add_to_master(presidio_matches, "Presidio")
|
| 37 |
+
gliner_set = add_to_master(gliner_matches, "GLiNER") # <--- Added GLiNER
|
| 38 |
|
| 39 |
+
# 2. Calculate "Missed" Data (Union of all models)
|
| 40 |
total_unique_pii = set(all_detections.keys())
|
| 41 |
|
| 42 |
regex_missed = total_unique_pii - regex_set
|
| 43 |
nltk_missed = total_unique_pii - nltk_set
|
| 44 |
spacy_missed = total_unique_pii - spacy_set
|
| 45 |
+
presidio_missed = total_unique_pii - presidio_set
|
| 46 |
+
gliner_missed = total_unique_pii - gliner_set # <--- Added GLiNER
|
| 47 |
|
| 48 |
def fmt(item_set):
|
| 49 |
items = [all_detections[k]['text'] for k in item_set]
|
| 50 |
+
# Limiting to first 5 items to prevent UI clutter if list is huge
|
| 51 |
+
display_items = items[:5]
|
| 52 |
+
res = ", ".join(display_items)
|
| 53 |
+
if len(items) > 5:
|
| 54 |
+
res += f", (+{len(items)-5} more)"
|
| 55 |
+
return res if res else "None"
|
| 56 |
|
| 57 |
total_count = len(total_unique_pii) if len(total_unique_pii) > 0 else 1
|
| 58 |
|
|
|
|
| 84 |
"Missed PII": fmt(presidio_missed),
|
| 85 |
"Accuracy": len(presidio_set) / total_count,
|
| 86 |
"Count": len(presidio_set)
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"Model": "π¦
GLiNER",
|
| 90 |
+
"Detected PII": fmt(gliner_set),
|
| 91 |
+
"Missed PII": fmt(gliner_missed),
|
| 92 |
+
"Accuracy": len(gliner_set) / total_count,
|
| 93 |
+
"Count": len(gliner_set)
|
| 94 |
}
|
| 95 |
]
|
| 96 |
|
| 97 |
+
# Return sorted by Accuracy descending so best model is on top
|
| 98 |
+
return pd.DataFrame(stats).sort_values(by="Accuracy", ascending=False)
|
presidio_model.py β classifier_manager/presidio_model.py
RENAMED
|
File without changes
|
classifier_manager/regex_scanner.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
class RegexScanner:
    """Regex-based PII scanner with a user-extensible pattern set."""

    def __init__(self):
        # Highlight colors keyed by PII label; DEFAULT is the fallback.
        self.colors = {
            "EMAIL": "#8ef", "FIRST_NAME": "#af9", "LAST_NAME": "#af9",
            "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea",
            "LOCATION": "#dcf", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"
        }

        # Built-in detection patterns; labels are upper-case by convention.
        self.patterns: Dict[str, str] = {
            "EMAIL": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            "PHONE": r"\b(?:\+?1[-. ]?)?\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})\b",
            "SSN": r"\b\d{3}-\d{2}-\d{4}\b",
            "CREDIT_CARD": r"\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b",
            "AADHAAR_IND": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b",
            "PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b",
        }

    def add_pattern(self, name, regex):
        """Register (or replace) a pattern under an upper-cased label."""
        self.patterns[name.upper()] = regex

    def remove_pattern(self, name):
        """Remove a pattern by label; unknown labels are ignored."""
        self.patterns.pop(name.upper(), None)

    def scan(self, text: str) -> List[dict]:
        """Run every registered pattern over *text*.

        Returns detection dicts (label/text/start/end/source); invalid
        user-defined patterns are skipped rather than raising.
        """
        hits: List[dict] = []
        for label, pattern in self.patterns.items():
            try:
                found = re.finditer(pattern, text)
            except re.error:
                continue  # invalid user-defined regex
            hits.extend(
                {
                    "label": label,
                    "text": match.group(),
                    "start": match.start(),
                    "end": match.end(),
                    "source": "Regex",
                }
                for match in found
            )
        return hits
|
Spacy_model.py β classifier_manager/spacy_model.py
RENAMED
|
File without changes
|
connectors/__init__.py
ADDED
|
File without changes
|
connectors/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (198 Bytes). View file
|
|
|
connectors/__pycache__/aws_s3_handler.cpython-313.pyc
ADDED
|
Binary file (2.55 kB). View file
|
|
|
connectors/__pycache__/azure_handler.cpython-313.pyc
ADDED
|
Binary file (2.55 kB). View file
|
|
|
connectors/__pycache__/confluence_handler.cpython-313.pyc
ADDED
|
Binary file (2.02 kB). View file
|
|
|
connectors/__pycache__/drive_handler.cpython-313.pyc
ADDED
|
Binary file (3.18 kB). View file
|
|
|
connectors/__pycache__/gcp_storage_handler.cpython-313.pyc
ADDED
|
Binary file (3.11 kB). View file
|
|
|
connectors/__pycache__/gmail_handler.cpython-313.pyc
ADDED
|
Binary file (5.2 kB). View file
|
|
|
connectors/__pycache__/mongo_handler.cpython-313.pyc
ADDED
|
Binary file (2.32 kB). View file
|
|
|
connectors/__pycache__/mysql_handler.cpython-313.pyc
ADDED
|
Binary file (1.55 kB). View file
|
|
|
connectors/__pycache__/postgres_handler.cpython-313.pyc
ADDED
|
Binary file (1.58 kB). View file
|
|
|
connectors/__pycache__/slack_handler.cpython-313.pyc
ADDED
|
Binary file (2.6 kB). View file
|
|
|
connectors/aws_s3_handler.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import boto3
|
| 2 |
+
import io
|
| 3 |
+
|
| 4 |
+
class S3Handler:
    """Thin wrapper around the boto3 S3 client for bucket/object browsing."""

    def __init__(self):
        print("✅ AWS S3 Handler loaded.")

    def _client(self, access_key, secret_key, region):
        """Build a boto3 S3 client from explicit credentials.

        Extracted helper: the same construction was duplicated in all
        three public methods.
        """
        return boto3.client(
            's3',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
        )

    def get_buckets(self, access_key, secret_key, region):
        """Return every bucket name, or [] on any error."""
        try:
            response = self._client(access_key, secret_key, region).list_buckets()
            return [b['Name'] for b in response.get('Buckets', [])]
        except Exception as e:
            print(f"❌ S3 Error: {e}")
            return []

    def get_files(self, access_key, secret_key, region, bucket_name):
        """Return object keys in *bucket_name*, or [] on any error.

        NOTE: list_objects_v2 returns at most 1000 keys per call; larger
        buckets would need pagination.
        """
        try:
            response = self._client(access_key, secret_key, region).list_objects_v2(Bucket=bucket_name)
            return [obj['Key'] for obj in response.get('Contents', [])]
        except Exception as e:
            # Fix: this error was silently swallowed, unlike the sibling methods.
            print(f"❌ S3 List Error: {e}")
            return []

    def download_file(self, access_key, secret_key, region, bucket_name, file_key):
        """Download one object's bytes, or b"" on any error."""
        try:
            obj = self._client(access_key, secret_key, region).get_object(Bucket=bucket_name, Key=file_key)
            return obj['Body'].read()
        except Exception as e:
            print(f"❌ S3 Download Error: {e}")
            return b""
|
connectors/azure_handler.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from azure.storage.blob import BlobServiceClient
|
| 2 |
+
|
| 3 |
+
class AzureBlobHandler:
    """Container/blob browsing for Azure Blob Storage via a connection string."""

    def __init__(self):
        print("✅ Azure Blob Handler loaded.")

    def get_containers(self, conn_str):
        """Return container names for the account, or [] on any error."""
        try:
            blob_service_client = BlobServiceClient.from_connection_string(conn_str)
            containers = blob_service_client.list_containers()
            # NOTE(review): assumes the SDK's container items support
            # mapping-style access (c['name']) — confirm for the installed
            # azure-storage-blob version.
            return [c['name'] for c in containers]
        except Exception as e:
            print(f"❌ Azure Error: {e}")
            return []

    def get_blobs(self, conn_str, container_name):
        """Return blob names inside *container_name*, or [] on any error."""
        try:
            blob_service_client = BlobServiceClient.from_connection_string(conn_str)
            container_client = blob_service_client.get_container_client(container_name)
            blobs = container_client.list_blobs()
            return [b['name'] for b in blobs]
        except Exception as e:
            # Errors here are intentionally silent (no print, unlike siblings).
            return []

    def download_blob(self, conn_str, container_name, blob_name):
        """Download one blob's raw bytes, or b"" on any error."""
        try:
            blob_service_client = BlobServiceClient.from_connection_string(conn_str)
            blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
            return blob_client.download_blob().readall()
        except Exception as e:
            print(f"❌ Azure Download Error: {e}")
            return b""
|
connectors/confluence_handler.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from atlassian import Confluence
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
|
| 5 |
+
class ConfluenceHandler:
    """Fetches and cleans Confluence page content for PII scanning."""

    def __init__(self):
        print("✅ Confluence Handler loaded.")

    def fetch_page_content(self, url, username, api_token, page_id):
        """Fetch the body content of a specific Confluence page.

        Returns a single-row DataFrame (Source/Sender/Subject/Content) so
        the page can be scanned like other text sources, or an empty
        DataFrame on any error.
        """
        try:
            # Initialize the Confluence API client; cloud=True targets
            # Atlassian Cloud and the API token is passed as the password.
            confluence = Confluence(
                url=url,
                username=username,
                password=api_token,
                cloud=True
            )

            # Get page content, expanding body.storage to include the HTML body.
            page = confluence.get_page_by_id(page_id, expand='body.storage')
            title = page.get('title', 'Unknown Title')

            # Extract the stored HTML body (may be empty).
            raw_html = page.get('body', {}).get('storage', {}).get('value', '')

            # Strip HTML tags so downstream PII scanners see plain text.
            if raw_html:
                clean_text = BeautifulSoup(raw_html, "html.parser").get_text(separator=' ')
            else:
                clean_text = ""

            return pd.DataFrame([{
                "Source": "Confluence",
                "Sender": username,
                "Subject": title,
                "Content": clean_text
            }])

        except Exception as e:
            print(f"❌ Confluence Error: {e}")
            return pd.DataFrame()
|
connectors/drive_handler.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import json
|
| 3 |
+
from googleapiclient.discovery import build
|
| 4 |
+
from googleapiclient.http import MediaIoBaseDownload
|
| 5 |
+
from google.oauth2 import service_account
|
| 6 |
+
|
| 7 |
+
class DriveHandler:
    """Lists and downloads Google Drive files using a service account."""

    def __init__(self):
        print("✅ Google Drive Handler loaded.")

    def list_files(self, credentials_dict):
        """Return up to 15 files (id/name/mimeType dicts) visible to the
        service account, or [] on any error."""
        try:
            creds = service_account.Credentials.from_service_account_info(
                credentials_dict, scopes=['https://www.googleapis.com/auth/drive.readonly']
            )
            service = build('drive', 'v3', credentials=creds)
            results = service.files().list(
                pageSize=15, fields="files(id, name, mimeType)"
            ).execute()
            return results.get('files', [])
        except Exception as e:
            print(f"❌ Drive List Error: {e}")
            return []

    def download_file(self, file_id, mime_type, credentials_dict) -> bytes:
        """Download a Drive file's bytes, or b"" on any error.

        Native Google formats cannot be fetched raw, so they are exported:
        Sheets -> CSV, Docs/Slides -> PDF; everything else downloads directly.
        """
        try:
            creds = service_account.Credentials.from_service_account_info(
                credentials_dict, scopes=['https://www.googleapis.com/auth/drive.readonly']
            )
            service = build('drive', 'v3', credentials=creds)

            # Export Google Docs formats to standard formats.
            if "spreadsheet" in mime_type:
                request = service.files().export_media(fileId=file_id, mimeType='text/csv')
            elif "document" in mime_type:
                request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            elif "presentation" in mime_type:
                request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            else:
                # Download binary files directly.
                request = service.files().get_media(fileId=file_id)

            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            # Drive downloads arrive in chunks; loop until the transfer completes.
            while done is False:
                status, done = downloader.next_chunk()

            return fh.getvalue()
        except Exception as e:
            print(f"❌ Drive Download Error: {e}")
            return b""
|
connectors/gcp_storage_handler.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from google.cloud import storage
|
| 2 |
+
from google.oauth2 import service_account
|
| 3 |
+
|
| 4 |
+
class GcpStorageHandler:
    """Bucket/blob browsing on Google Cloud Storage using service-account info."""

    def __init__(self):
        print("✅ GCP Storage Handler loaded.")

    def _client(self, credentials_dict):
        """Build a storage client from a service-account info dict.

        Extracted helper: the same two-line construction was duplicated
        in all three public methods.
        """
        credentials = service_account.Credentials.from_service_account_info(credentials_dict)
        return storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))

    def get_buckets(self, credentials_dict):
        """Return every bucket name, or [] on any error."""
        try:
            return [bucket.name for bucket in self._client(credentials_dict).list_buckets()]
        except Exception as e:
            print(f"❌ GCP Bucket Error: {e}")
            return []

    def get_files(self, credentials_dict, bucket_name):
        """Return blob names in *bucket_name*, or [] on any error."""
        try:
            return [blob.name for blob in self._client(credentials_dict).list_blobs(bucket_name)]
        except Exception as e:
            print(f"❌ GCP List Error: {e}")
            return []

    def download_file(self, credentials_dict, bucket_name, blob_name):
        """Download one blob's bytes, or b"" on any error."""
        try:
            blob = self._client(credentials_dict).bucket(bucket_name).blob(blob_name)
            return blob.download_as_bytes()
        except Exception as e:
            print(f"❌ GCP Download Error: {e}")
            return b""
|
connectors/gmail_handler.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import os
|
| 3 |
+
import pickle
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
from googleapiclient.discovery import build
|
| 7 |
+
from google_auth_oauthlib.flow import InstalledAppFlow
|
| 8 |
+
from google.auth.transport.requests import Request
|
| 9 |
+
|
| 10 |
+
class GmailHandler:
    """Connector that authenticates against the Gmail API and pulls recent emails
    into a DataFrame with Source/Sender/Subject/Content columns."""

    def __init__(self):
        print("β Gmail Handler loaded.")

    def _get_credentials(self, credentials_file, scopes, token_path):
        """Load cached OAuth credentials, refreshing or re-running the installed-app
        flow as needed, and persist the refreshed token back to token_path."""
        creds = None
        if os.path.exists(token_path):
            # NOTE(review): unpickling the local token cache is conventional for
            # Gmail quickstarts, but never unpickle files from untrusted sources.
            with open(token_path, 'rb') as token:
                creds = pickle.load(token)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                # InstalledAppFlow requires a real file path, so persist the
                # uploaded client secret briefly and always clean it up.
                secret_path = "temp_client_secret.json"
                with open(secret_path, "wb") as f:
                    f.write(credentials_file.getvalue())
                try:
                    flow = InstalledAppFlow.from_client_secrets_file(secret_path, scopes)
                    creds = flow.run_local_server(port=0)
                finally:
                    # Fix: previously the secret file could linger on disk if the
                    # OAuth flow raised; remove it on every path.
                    if os.path.exists(secret_path):
                        os.remove(secret_path)

            with open(token_path, 'wb') as token:
                pickle.dump(creds, token)
        return creds

    def _extract_body(self, payload):
        """Pull the plain-text body out of a message payload (multipart or simple)."""
        body = ""
        if 'parts' in payload:
            for part in payload['parts']:
                if part['mimeType'] == 'text/plain' and 'data' in part['body']:
                    # errors="replace": one badly-encoded part must not abort the batch.
                    body += base64.urlsafe_b64decode(part['body']['data']).decode(errors="replace")
        elif 'body' in payload and 'data' in payload['body']:
            body += base64.urlsafe_b64decode(payload['body']['data']).decode(errors="replace")
        return body

    def fetch_emails(self, credentials_file, num_emails=10) -> pd.DataFrame:
        """
        Authenticates and fetches emails from Gmail.

        Args:
            credentials_file: uploaded client-secret file object (supports .getvalue()).
            num_emails: maximum number of messages to fetch.

        Returns:
            DataFrame with Source/Sender/Subject/Content columns; empty on failure.
        """
        try:
            creds = self._get_credentials(
                credentials_file,
                ['https://www.googleapis.com/auth/gmail.readonly'],
                'token.pickle',
            )

            service = build('gmail', 'v1', credentials=creds)
            results = service.users().messages().list(userId='me', maxResults=num_emails).execute()
            messages = results.get('messages', [])

            email_data = []
            for message in messages:
                msg = service.users().messages().get(userId='me', id=message['id']).execute()
                payload = msg['payload']
                # Fix: default to [] — a message without headers previously made
                # next(... for h in headers ...) raise TypeError and abort the fetch.
                headers = payload.get("headers", [])

                subject = next((h['value'] for h in headers if h['name'] == 'Subject'), "No Subject")
                sender = next((h['value'] for h in headers if h['name'] == 'From'), "Unknown")

                # Strip any HTML markup so Content is clean text.
                clean_body = BeautifulSoup(self._extract_body(payload), "html.parser").get_text()
                email_data.append({
                    "Source": "Gmail",
                    "Sender": sender,
                    "Subject": subject,
                    "Content": f"Subject: {subject}\n\n{clean_body}"
                })

            return pd.DataFrame(email_data)

        except Exception as e:
            print(f"β Gmail Error: {e}")
            return pd.DataFrame()
connectors/mongo_handler.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from urllib.parse import quote_plus
|
| 3 |
+
|
| 4 |
+
class MongoHandler:
    """Connector that samples documents from a MongoDB collection into a DataFrame."""

    def __init__(self):
        # pymongo is optional; degrade gracefully when it is not installed.
        try:
            import pymongo
            self.pymongo = pymongo
            print("β MongoDB Handler loaded.")
        except ImportError:
            self.pymongo = None
            print("β PyMongo not installed.")

    def fetch_data(self, host, port, db, user, pw, collection):
        """
        Fetch up to 100 documents from db.collection as a flattened DataFrame.

        Credentials are URL-escaped so special characters in user/password are safe
        in the connection URI. Returns an empty DataFrame when pymongo is
        unavailable or on any connection/query error.
        """
        if not self.pymongo:
            return pd.DataFrame()

        client = None
        try:
            if user and pw:
                safe_user = quote_plus(user)
                safe_pw = quote_plus(pw)
                uri = f"mongodb://{safe_user}:{safe_pw}@{host}:{port}/"
            else:
                uri = f"mongodb://{host}:{port}/"

            client = self.pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000)
            # Force a server round-trip so bad connections fail fast here.
            client.server_info()

            data = list(client[db][collection].find().limit(100))
            if not data:
                return pd.DataFrame()

            # ObjectId is not DataFrame-friendly; stringify it.
            for d in data:
                if '_id' in d:
                    d['_id'] = str(d['_id'])

            return pd.json_normalize(data)

        except Exception as e:
            print(f"β Mongo Error: {e}")
            return pd.DataFrame()
        finally:
            # Fix: the client was previously never closed, leaking the connection
            # pool on every call. Close it on success and failure alike.
            if client is not None:
                client.close()
connectors/mysql_handler.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sqlalchemy import create_engine
|
| 3 |
+
from urllib.parse import quote_plus
|
| 4 |
+
|
| 5 |
+
class MysqlHandler:
    """Connector that samples rows from a MySQL table into a DataFrame."""

    def __init__(self):
        print("β MySQL Handler loaded.")

    def fetch_data(self, host, port, db, user, pw, table):
        """
        Connects to MySQL and fetches the first 100 rows of a table.

        Args:
            host, port, db, user, pw: connection parameters.
            table: table name; must be a plain identifier (letters, digits, _, $).

        Returns:
            DataFrame of up to 100 rows; empty DataFrame on any failure or if the
            table name is not a safe identifier.
        """
        import re
        try:
            # Fix: the table name was interpolated into SQL unchecked (injection
            # risk). Table names cannot be bound as query parameters, so restrict
            # them to plain identifiers instead.
            if not re.fullmatch(r"[A-Za-z0-9_$]+", table or ""):
                print(f"β MySQL Error: invalid table name {table!r}")
                return pd.DataFrame()

            # Fix: escape the username too, not only the password — both can
            # contain URI-reserved characters.
            safe_user = quote_plus(user)
            safe_pw = quote_plus(pw)
            # Uses mysql+pymysql driver
            conn_str = f"mysql+pymysql://{safe_user}:{safe_pw}@{host}:{port}/{db}"
            engine = create_engine(conn_str)

            query = f"SELECT * FROM {table} LIMIT 100"
            return pd.read_sql(query, engine)
        except Exception as e:
            print(f"β MySQL Error: {e}")
            return pd.DataFrame()
connectors/postgres_handler.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sqlalchemy import create_engine
|
| 3 |
+
from urllib.parse import quote_plus
|
| 4 |
+
|
| 5 |
+
class PostgresHandler:
    """Connector that samples rows from a PostgreSQL table into a DataFrame."""

    def __init__(self):
        print("β PostgreSQL Handler loaded.")

    def fetch_data(self, host, port, db, user, pw, table):
        """
        Connects to PostgreSQL and fetches the first 100 rows of a table.

        Args:
            host, port, db, user, pw: connection parameters.
            table: table name; must be a plain identifier (letters, digits, _, $).

        Returns:
            DataFrame of up to 100 rows; empty DataFrame on any failure or if the
            table name is not a safe identifier.
        """
        import re
        try:
            # Fix: the table name was interpolated into SQL unchecked (injection
            # risk). Table names cannot be bound as query parameters, so restrict
            # them to plain identifiers instead.
            if not re.fullmatch(r"[A-Za-z0-9_$]+", table or ""):
                print(f"β PostgreSQL Error: invalid table name {table!r}")
                return pd.DataFrame()

            # Fix: escape the username too, not only the password — both can
            # contain URI-reserved characters.
            safe_user = quote_plus(user)
            safe_pw = quote_plus(pw)
            # SQLAlchemy connection string
            conn_str = f"postgresql://{safe_user}:{safe_pw}@{host}:{port}/{db}"
            engine = create_engine(conn_str)

            query = f"SELECT * FROM {table} LIMIT 100"
            return pd.read_sql(query, engine)
        except Exception as e:
            print(f"β PostgreSQL Error: {e}")
            return pd.DataFrame()
connectors/slack_handler.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from slack_sdk import WebClient
|
| 3 |
+
from slack_sdk.errors import SlackApiError
|
| 4 |
+
import datetime
|
| 5 |
+
|
| 6 |
+
class SlackHandler:
    """Connector that reads recent messages from a Slack channel via the Web API."""

    def __init__(self):
        print("β Slack Handler loaded.")

    def fetch_messages(self, token, channel_id, num_messages=20):
        """
        Fetches recent messages from a specific Slack channel.

        Args:
            token: Slack bot/user token.
            channel_id: channel to read from.
            num_messages: maximum number of messages to request.

        Returns:
            DataFrame with Source/Sender/Subject/Content columns; empty on failure
            or when the channel has no plain messages.
        """
        try:
            api = WebClient(token=token)
            # Pull the channel's recent conversation history.
            history = api.conversations_history(channel=channel_id, limit=num_messages)

            collected = []
            if history['ok']:
                for entry in history['messages']:
                    # Ignore system events such as 'channel_join'; keep real posts.
                    if 'subtype' in entry:
                        continue
                    author = entry.get('user', 'Unknown')
                    body = entry.get('text', '')
                    stamp = float(entry.get('ts', 0))
                    when = datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d %H:%M:%S')

                    collected.append({
                        "Source": "Slack",
                        "Sender": author,
                        "Subject": f"Message in {channel_id} at {when}",
                        "Content": body
                    })

            if not collected:
                print("β οΈ No messages found in channel.")
                return pd.DataFrame()

            return pd.DataFrame(collected)

        except SlackApiError as e:
            print(f"β Slack API Error: {e.response['error']}")
            return pd.DataFrame()
        except Exception as e:
            print(f"β Slack Handler Error: {e}")
            return pd.DataFrame()
file_handlers/__init__.py
ADDED
|
File without changes
|
file_handlers/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (201 Bytes). View file
|
|
|
file_handlers/__pycache__/avro_handler.cpython-313.pyc
ADDED
|
Binary file (2.01 kB). View file
|
|
|
file_handlers/__pycache__/json_handler.cpython-313.pyc
ADDED
|
Binary file (2.37 kB). View file
|
|
|
file_handlers/__pycache__/ocr_engine.cpython-313.pyc
ADDED
|
Binary file (1.92 kB). View file
|
|
|
file_handlers/__pycache__/parquet_handler.cpython-313.pyc
ADDED
|
Binary file (1.76 kB). View file
|
|
|
file_handlers/__pycache__/pdf_handler.cpython-313.pyc
ADDED
|
Binary file (3.8 kB). View file
|
|
|
file_handlers/avro_handler.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# avro_handler.py
|
| 2 |
+
import io
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
# avro_handler.py
class AvroHandler:
    """Decodes Avro container files into DataFrames (requires fastavro)."""

    def __init__(self):
        # fastavro is optional; record availability so callers degrade gracefully.
        self.available = False
        try:
            import fastavro
            self.fastavro = fastavro
            self.available = True
            print("β Avro Handler loaded.")
        except ImportError:
            print("β fastavro not found. Please run: pip install fastavro")

    def convert_to_dataframe(self, file_bytes: bytes) -> pd.DataFrame:
        """
        Reads Avro bytes and converts them to a Pandas DataFrame.

        Returns an empty DataFrame when fastavro is unavailable, the payload is
        empty, or decoding fails.
        """
        if not self.available:
            return pd.DataFrame()

        try:
            # Wrap the raw bytes so fastavro can treat them as a file object.
            buffer = io.BytesIO(file_bytes)
            rows = list(self.fastavro.reader(buffer))
            return pd.DataFrame(rows) if rows else pd.DataFrame()
        except Exception as e:
            print(f"β οΈ Avro Read Error: {e}")
            return pd.DataFrame()
file_handlers/json_handler.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import io
|
| 4 |
+
|
| 5 |
+
class JsonHandler:
    """Loads JSON input and flattens nested structures into a flat DataFrame."""

    def __init__(self):
        print("β JSON Handler loaded.")

    def read_file(self, file_obj) -> pd.DataFrame:
        """
        Reads a JSON file object (or Streamlit UploadedFile) and flattens it.

        Nested dicts become underscore-joined column names ("a_b"); lists are
        stringified under a "<prefix>list" column. A top-level JSON array yields
        one row per element. Returns an empty DataFrame on any parse error.
        """
        try:
            # Streamlit uploads expose getvalue() (raw bytes); plain file objects
            # are handed straight to json.load.
            if hasattr(file_obj, "getvalue"):
                data = json.loads(file_obj.getvalue().decode('utf-8'))
            else:
                data = json.load(file_obj)

            def flatten(node, prefix=''):
                # Dicts recurse, extending the underscore-joined key path; lists
                # are stringified; scalars keep the accumulated key minus the
                # trailing underscore.
                if isinstance(node, dict):
                    flat = {}
                    for key in node:
                        flat.update(flatten(node[key], prefix + key + '_'))
                    return flat
                if isinstance(node, list):
                    return {f"{prefix}list": str(node)}
                return {prefix[:-1]: node}

            if isinstance(data, list):
                return pd.DataFrame([flatten(item) for item in data])
            return pd.DataFrame([flatten(data)])

        except Exception as e:
            print(f"β JSON Read Error: {e}")
            return pd.DataFrame()