SHAFI committed on
Commit
cf4e4d4
Β·
1 Parent(s): a8a38df

second commit with refactored code

Browse files
This view is limited to 50 files because it contains too many changes. Β  See raw diff
Files changed (50) hide show
  1. __pycache__/avro_handler.cpython-313.pyc +0 -0
  2. __pycache__/backend.cpython-313.pyc +0 -0
  3. __pycache__/gliner_model.cpython-313.pyc +0 -0
  4. __pycache__/inspector.cpython-313.pyc +0 -0
  5. __pycache__/ocr_engine.cpython-313.pyc +0 -0
  6. api.py +143 -0
  7. backend.py +173 -307
  8. new_spacy β†’ classifier_manager/__init__.py +0 -0
  9. classifier_manager/__pycache__/__init__.cpython-313.pyc +0 -0
  10. classifier_manager/__pycache__/gliner_model.cpython-313.pyc +0 -0
  11. classifier_manager/__pycache__/inspector.cpython-313.pyc +0 -0
  12. classifier_manager/__pycache__/presidio_model.cpython-313.pyc +0 -0
  13. classifier_manager/__pycache__/regex_scanner.cpython-313.pyc +0 -0
  14. classifier_manager/__pycache__/spacy_model.cpython-313.pyc +0 -0
  15. classifier_manager/gliner_model.py +81 -0
  16. inspector.py β†’ classifier_manager/inspector.py +24 -8
  17. presidio_model.py β†’ classifier_manager/presidio_model.py +0 -0
  18. classifier_manager/regex_scanner.py +44 -0
  19. Spacy_model.py β†’ classifier_manager/spacy_model.py +0 -0
  20. connectors/__init__.py +0 -0
  21. connectors/__pycache__/__init__.cpython-313.pyc +0 -0
  22. connectors/__pycache__/aws_s3_handler.cpython-313.pyc +0 -0
  23. connectors/__pycache__/azure_handler.cpython-313.pyc +0 -0
  24. connectors/__pycache__/confluence_handler.cpython-313.pyc +0 -0
  25. connectors/__pycache__/drive_handler.cpython-313.pyc +0 -0
  26. connectors/__pycache__/gcp_storage_handler.cpython-313.pyc +0 -0
  27. connectors/__pycache__/gmail_handler.cpython-313.pyc +0 -0
  28. connectors/__pycache__/mongo_handler.cpython-313.pyc +0 -0
  29. connectors/__pycache__/mysql_handler.cpython-313.pyc +0 -0
  30. connectors/__pycache__/postgres_handler.cpython-313.pyc +0 -0
  31. connectors/__pycache__/slack_handler.cpython-313.pyc +0 -0
  32. connectors/aws_s3_handler.py +32 -0
  33. connectors/azure_handler.py +32 -0
  34. connectors/confluence_handler.py +44 -0
  35. connectors/drive_handler.py +52 -0
  36. connectors/gcp_storage_handler.py +37 -0
  37. connectors/gmail_handler.py +77 -0
  38. connectors/mongo_handler.py +45 -0
  39. connectors/mysql_handler.py +23 -0
  40. connectors/postgres_handler.py +23 -0
  41. connectors/slack_handler.py +47 -0
  42. file_handlers/__init__.py +0 -0
  43. file_handlers/__pycache__/__init__.cpython-313.pyc +0 -0
  44. file_handlers/__pycache__/avro_handler.cpython-313.pyc +0 -0
  45. file_handlers/__pycache__/json_handler.cpython-313.pyc +0 -0
  46. file_handlers/__pycache__/ocr_engine.cpython-313.pyc +0 -0
  47. file_handlers/__pycache__/parquet_handler.cpython-313.pyc +0 -0
  48. file_handlers/__pycache__/pdf_handler.cpython-313.pyc +0 -0
  49. file_handlers/avro_handler.py +36 -0
  50. file_handlers/json_handler.py +39 -0
__pycache__/avro_handler.cpython-313.pyc ADDED
Binary file (2 kB). View file
 
__pycache__/backend.cpython-313.pyc CHANGED
Binary files a/__pycache__/backend.cpython-313.pyc and b/__pycache__/backend.cpython-313.pyc differ
 
__pycache__/gliner_model.cpython-313.pyc ADDED
Binary file (2.88 kB). View file
 
__pycache__/inspector.cpython-313.pyc CHANGED
Binary files a/__pycache__/inspector.cpython-313.pyc and b/__pycache__/inspector.cpython-313.pyc differ
 
__pycache__/ocr_engine.cpython-313.pyc ADDED
Binary file (1.9 kB). View file
 
api.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # api.py
2
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
3
+ from pydantic import BaseModel
4
+ from typing import Optional, List
5
+ import pandas as pd
6
+ import io
7
+ import json
8
+
9
+ # Import your existing backend orchestrator
10
+ from core.backend import RegexClassifier
11
+
12
+ app = FastAPI(title="Segmento Sense API")
13
+
14
+ # Initialize the Brain
15
+ backend = RegexClassifier()
16
+
17
+ # --- Pydantic Models for Requests ---
18
class DbConnection(BaseModel):
    """Request body for /scan/database: credentials and target for a DB scan."""
    type: str  # postgres, mysql, mongo
    host: str
    port: str
    user: str
    password: str
    database: str
    collection: Optional[str] = None  # Mongo collection; doubles as table name for SQL callers
26
+
27
class CloudConnection(BaseModel):
    """Request body for cloud-storage scans: credentials plus object location."""
    service: str  # aws, azure, gcp
    key_1: str  # access_key or conn_string
    key_2: Optional[str] = None  # secret_key
    region: Optional[str] = None
    bucket: str
    file_name: str
34
+
35
class AppConnection(BaseModel):
    """Request body for /scan/app: token and target for a SaaS-source scan."""
    service: str  # gmail, slack, confluence
    token_or_path: str  # token or credentials.json content
    target: str  # channel_id, page_id, or num_emails
39
+
40
+ # --- ENDPOINTS ---
41
+
42
@app.get("/")
def health_check():
    """Liveness probe: confirms the API process is up."""
    return {"status": "Segmento Sense is running"}
45
+
46
@app.post("/scan/file")
async def scan_file(file: UploadFile = File(...)):
    """
    Scan an uploaded file for PII.

    Unstructured inputs (PDF — first page only, for the demo — and images
    via OCR) return the extracted text, hybrid matches and per-engine
    stats. Structured inputs (CSV/JSON/Parquet/Avro) return PII counts,
    a masked 20-row preview and the inferred schema.

    Raises:
        HTTPException 415: for file extensions no handler supports.
    """
    file_bytes = await file.read()
    filename = file.filename.lower()

    # --- Unstructured sources: extract plain text first ---
    if filename.endswith(".pdf"):
        # For demo purposes only page 0 is scanned.
        raw_text = backend.get_pdf_page_text(file_bytes, 0)
    elif filename.endswith((".png", ".jpg", ".jpeg")):
        raw_text = backend.get_ocr_text_from_image(file_bytes)
    else:
        raw_text = None

    if raw_text is not None:
        inspection = backend.run_full_inspection(raw_text)
        matches = backend.analyze_text_hybrid(raw_text)
        return {
            "type": "unstructured",
            "content": raw_text,
            "matches": matches,
            "stats": inspection.to_dict(orient="records")
        }

    # --- Structured sources: load into a DataFrame ---
    if filename.endswith(".csv"):
        df = pd.read_csv(io.BytesIO(file_bytes))
    elif filename.endswith(".json"):
        df = backend.get_json_data(io.BytesIO(file_bytes))
    elif filename.endswith(".parquet"):
        df = backend.get_parquet_data(file_bytes)
    elif filename.endswith(".avro"):
        df = backend.get_avro_data(file_bytes)
    else:
        # Previously an unknown extension fell through with an empty frame,
        # producing a misleading "no PII found" response; fail loudly instead.
        raise HTTPException(status_code=415, detail=f"Unsupported file type: {filename}")

    pii_counts = backend.get_pii_counts_dataframe(df)
    masked_preview = backend.mask_dataframe(df.head(20))

    return {
        "type": "structured",
        "pii_counts": pii_counts.to_dict(orient="records"),
        "preview": masked_preview.to_dict(orient="records"),
        "schema": backend.get_data_schema(df).to_dict(orient="records")
    }
103
+
104
@app.post("/scan/database")
async def scan_db(conn: DbConnection):
    """
    Scan a database table/collection for PII.

    Routes on ``conn.type`` (postgres, mysql, mongo); ``conn.collection``
    doubles as the table name for the SQL backends.

    Raises:
        HTTPException 400: unknown ``conn.type``.
        HTTPException 404: connection yielded no rows.
    """
    fetchers = {
        "postgres": backend.get_postgres_data,
        "mysql": backend.get_mysql_data,
        "mongo": backend.get_mongodb_data,
    }
    fetch = fetchers.get(conn.type)
    if fetch is None:
        # Previously an unknown type fell through to a misleading 404
        # ("Connection failed"); report the real problem instead.
        raise HTTPException(status_code=400, detail=f"Unsupported database type: {conn.type}")

    df = fetch(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection)

    if df.empty:
        raise HTTPException(status_code=404, detail="Connection failed or no data found")

    pii_counts = backend.get_pii_counts_dataframe(df)
    return {
        "source": conn.type,
        "pii_counts": pii_counts.to_dict(orient="records"),
        "preview": backend.mask_dataframe(df.head(10)).to_dict(orient="records")
    }
123
+
124
@app.post("/scan/app")
async def scan_app(conn: AppConnection):
    """
    Scan a SaaS application source for PII. Currently only Slack is wired up.

    Raises:
        HTTPException 501: service declared in AppConnection but not implemented
            (confluence, gmail).
        HTTPException 400: the fetch returned no rows.
    """
    if conn.service == "slack":
        df = backend.get_slack_messages(conn.token_or_path, conn.target)
    else:
        # confluence/gmail are accepted by the request model but have no
        # handler here yet; previously they silently fell through to a
        # generic 400 "No data fetched". Be explicit about why.
        raise HTTPException(status_code=501, detail=f"Service not implemented: {conn.service}")

    if df.empty:
        raise HTTPException(status_code=400, detail="No data fetched")

    pii_counts = backend.get_pii_counts_dataframe(df)
    return {
        "source": conn.service,
        "pii_counts": pii_counts.to_dict(orient="records"),
        "preview": backend.mask_dataframe(df.head(10)).to_dict(orient="records")
    }
backend.py CHANGED
@@ -1,68 +1,66 @@
1
- # backend.py
2
  import re
3
  import json
4
  import pandas as pd
5
  import fitz # PyMuPDF
6
  import nltk
7
  import io
 
 
 
8
  from typing import Dict, List, Any
9
  from sqlalchemy import create_engine
10
  from urllib.parse import quote_plus
11
-
12
- # --- IMPORT MODULES ---
13
- from spacy_model import PiiSpacyAnalyzer
14
- from presidio_model import PiiPresidioAnalyzer
15
- from inspector import ModelInspector
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # --- DEPENDENCY CHECKS ---
18
  try:
19
  from googleapiclient.discovery import build
20
- from googleapiclient.http import MediaIoBaseDownload
21
- from google.oauth2 import service_account
22
  GOOGLE_AVAILABLE = True
23
  except ImportError:
24
  GOOGLE_AVAILABLE = False
25
- print("Google Drive Libraries not installed.")
26
-
27
  try:
28
  import pymongo
29
  MONGO_AVAILABLE = True
30
- except ImportError:
31
- MONGO_AVAILABLE = False
32
- print("PyMongo not installed.")
33
-
34
- try:
35
- import pyarrow
36
- PARQUET_AVAILABLE = True
37
- except ImportError:
38
- PARQUET_AVAILABLE = False
39
- print("PyArrow not installed.")
40
-
41
  try:
42
  import boto3
43
  AWS_AVAILABLE = True
44
- except ImportError:
45
- AWS_AVAILABLE = False
46
- print("Boto3 not installed.")
47
-
48
  try:
49
  from azure.storage.blob import BlobServiceClient
50
  AZURE_AVAILABLE = True
51
- except ImportError:
52
- AZURE_AVAILABLE = False
53
- print("Azure Storage Blob not installed.")
54
-
55
- # --- GCP STORAGE IMPORT (NEW) ---
56
  try:
57
  from google.cloud import storage
58
- # We reuse google.oauth2.service_account if available, else import it
59
- from google.oauth2 import service_account as gcp_service_account
60
  GCS_AVAILABLE = True
61
- except ImportError:
62
- GCS_AVAILABLE = False
63
- print("Google Cloud Storage library not installed.")
64
 
65
- # --- NLTK SETUP ---
66
  try:
67
  nltk.data.find('tokenizers/punkt')
68
  except LookupError:
@@ -75,10 +73,9 @@ except LookupError:
75
  class RegexClassifier:
76
  def __init__(self):
77
  self.colors = {
78
- "EMAIL": (136, 238, 255), "FIRST_NAME": (170, 255, 170), "LAST_NAME": (170, 255, 170),
79
- "PHONE": (255, 170, 170), "SSN": (255, 204, 170), "CREDIT_CARD": (255, 238, 170),
80
- "LOCATION": (200, 170, 255), "AADHAAR_IND": (255, 150, 255), "ORG": (255, 255, 150),
81
- "DEFAULT": (224, 224, 224)
82
  }
83
 
84
  self.patterns: Dict[str, str] = {
@@ -90,69 +87,121 @@ class RegexClassifier:
90
  "PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b",
91
  }
92
 
 
93
  self.spacy_analyzer = PiiSpacyAnalyzer()
94
  self.presidio_analyzer = PiiPresidioAnalyzer()
 
95
  self.inspector = ModelInspector()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  def list_patterns(self): return self.patterns
98
  def add_pattern(self, n, r): self.patterns[n.upper()] = r
99
  def remove_pattern(self, n): self.patterns.pop(n.upper(), None)
100
 
101
- # --- DETECTION ENGINES ---
102
  def scan_with_regex(self, text: str) -> List[dict]:
103
  matches = []
104
  for label, regex in self.patterns.items():
105
- for match in re.finditer(regex, text):
106
- matches.append({"label": label, "text": match.group(), "start": match.start(), "end": match.end()})
107
  return matches
108
 
109
  def scan_with_nltk(self, text: str) -> List[dict]:
110
  detections = []
111
  try:
112
- tokens = nltk.word_tokenize(text)
113
- chunked = nltk.ne_chunk(nltk.pos_tag(tokens), binary=False)
114
- current_pos = 0
115
- for chunk in chunked:
116
  if hasattr(chunk, 'label') and chunk.label() in ['PERSON', 'GPE']:
117
  val = " ".join(c[0] for c in chunk)
118
- start_idx = text.find(val, current_pos)
119
- label = "LOCATION" if chunk.label() == 'GPE' else "FIRST_NAME"
120
- if start_idx != -1:
121
- detections.append({"label": label, "text": val, "start": start_idx, "end": start_idx + len(val)})
122
- current_pos = start_idx + len(val)
 
123
  except: pass
124
  return detections
125
 
126
  def analyze_text_hybrid(self, text: str) -> List[dict]:
 
127
  all_matches = []
128
  all_matches.extend(self.scan_with_regex(text))
129
  all_matches.extend(self.scan_with_nltk(text))
130
  all_matches.extend(self.spacy_analyzer.scan(text))
131
  all_matches.extend(self.presidio_analyzer.scan(text))
 
132
 
133
  all_matches.sort(key=lambda x: x['start'])
134
-
135
- unique_matches = []
136
  if not all_matches: return []
137
  curr = all_matches[0]
138
- for next_match in all_matches[1:]:
139
- if next_match['start'] < curr['end']:
140
- if len(next_match['text']) > len(curr['text']):
141
- curr = next_match
142
  else:
143
- unique_matches.append(curr)
144
- curr = next_match
145
- unique_matches.append(curr)
146
- return unique_matches
147
-
148
- def run_full_inspection(self, text: str) -> pd.DataFrame:
149
- r_matches = self.scan_with_regex(text)
150
- n_matches = self.scan_with_nltk(text)
151
- s_matches = self.spacy_analyzer.scan(text)
152
- p_matches = self.presidio_analyzer.scan(text)
153
- return self.inspector.compare_models(r_matches, n_matches, s_matches, p_matches)
154
-
155
- # --- SUMMARY & VISUALS ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def get_pii_counts(self, text: str) -> pd.DataFrame:
157
  matches = self.analyze_text_hybrid(str(text))
158
  if not matches: return pd.DataFrame(columns=["PII Type", "Count"])
@@ -160,261 +209,78 @@ class RegexClassifier:
160
  for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1
161
  return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"])
162
 
163
- def get_pii_counts_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
164
- full_text = " ".join(df.astype(str).values.flatten())
165
- return self.get_pii_counts(full_text)
166
-
167
- def mask_pii(self, text: str) -> str:
168
- text = str(text)
169
- matches = self.analyze_text_hybrid(text)
170
- matches.sort(key=lambda x: x['start'], reverse=True)
171
- for m in matches:
172
- masked_val = "******"
173
- if "<span" not in text[m['start']:m['end']]:
174
- text = text[:m['start']] + masked_val + text[m['end']:]
175
- return text
176
-
177
  def mask_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
178
- def safe_mask(val):
179
- if isinstance(val, (list, dict, tuple, set)): return self.mask_pii(str(val))
180
- if pd.isna(val): return val
181
- return self.mask_pii(str(val))
182
- return df.map(safe_mask)
183
-
184
- def get_labeled_pdf_image(self, file_bytes, page_num: int):
185
- try:
186
- doc = fitz.open(stream=file_bytes, filetype="pdf")
187
- if not (0 <= page_num < len(doc)): return None
188
- page = doc[page_num]
189
- text = page.get_text("text")
190
  matches = self.analyze_text_hybrid(text)
 
191
  for m in matches:
192
- color_norm = tuple(c/255 for c in self.colors.get(m['label'], self.colors["DEFAULT"]))
193
- quads = page.search_for(m['text'])
194
- for quad in quads:
195
- page.draw_rect(quad, color=color_norm, fill=color_norm, fill_opacity=0.4)
196
- page.insert_text(fitz.Point(quad.x0, quad.y0-2), m['label'], fontsize=6, color=(0,0,0))
197
- return page.get_pixmap(matrix=fitz.Matrix(2, 2)).tobytes("png")
198
- except: return None
199
 
200
  def scan_dataframe_with_html(self, df: pd.DataFrame) -> pd.DataFrame:
201
- def highlight_html(text):
202
  text = str(text)
203
  matches = self.analyze_text_hybrid(text)
204
  matches.sort(key=lambda x: x['start'], reverse=True)
205
- hex_map = {"EMAIL": "#8ef", "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea", "FIRST_NAME": "#af9", "LAST_NAME": "#af9", "LOCATION": "#dcf", "AADHAAR_IND": "#f9f", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"}
206
  for m in matches:
207
  if "<span" in text[m['start']:m['end']]: continue
208
- color = hex_map.get(m['label'], "#e0e0e0")
209
- tag = f'<span style="background-color: {color}; padding: 0 2px; border-radius: 3px; border: 1px solid #ccc;">{m["text"]}</span>'
210
- text = text[:m['start']] + tag + text[m['end']:]
211
  return text
212
- def safe_highlight(val):
213
- if isinstance(val, (list, dict)): return highlight_html(str(val))
214
- if pd.isna(val): return val
215
- return highlight_html(val)
216
- return df.map(safe_highlight)
217
-
218
- def get_data_schema(self, df: pd.DataFrame) -> pd.DataFrame:
219
- if df.empty: return pd.DataFrame(columns=["Column", "Type", "Sample"])
220
- schema_info = []
221
- for col in df.columns:
222
- d_type = str(df[col].dtype)
223
- first_valid_idx = df[col].first_valid_index()
224
- sample_val = str(df[col].loc[first_valid_idx]) if first_valid_idx is not None else "All Null"
225
- if len(sample_val) > 50: sample_val = sample_val[:47] + "..."
226
- schema_info.append({"Column Name": col, "Data Type": d_type, "Sample Value": sample_val})
227
- return pd.DataFrame(schema_info)
228
-
229
- # --- SQL/MONGO/DRIVE/S3/AZURE CONNECTORS ---
230
  def get_postgres_data(self, host, port, db, user, pw, table):
231
- safe_pw = quote_plus(pw)
232
- conn_str = f"postgresql://{user}:{safe_pw}@{host}:{port}/{db}"
233
- engine = create_engine(conn_str)
234
- return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine)
235
 
236
  def get_mysql_data(self, host, port, db, user, pw, table):
237
- safe_pw = quote_plus(pw)
238
- conn_str = f"mysql+pymysql://{user}:{safe_pw}@{host}:{port}/{db}"
239
- engine = create_engine(conn_str)
240
- return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine)
241
 
242
- def get_mongodb_data(self, host, port, db, user, pw, collection):
243
- if not MONGO_AVAILABLE: return pd.DataFrame()
244
- try:
245
- if user and pw:
246
- safe_user = quote_plus(user)
247
- safe_pw = quote_plus(pw)
248
- uri = f"mongodb://{safe_user}:{safe_pw}@{host}:{port}/"
249
- else:
250
- uri = f"mongodb://{host}:{port}/"
251
- client = pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000)
252
- database = client[db]
253
- col = database[collection]
254
- cursor = col.find().limit(100)
255
- data_list = list(cursor)
256
- if not data_list: return pd.DataFrame()
257
- for doc in data_list:
258
- if '_id' in doc: doc['_id'] = str(doc['_id'])
259
- return pd.json_normalize(data_list)
260
- except Exception as e:
261
- print(f"Mongo Error: {e}")
262
- raise e
263
 
264
  def get_google_drive_files(self, credentials_dict):
265
- if not GOOGLE_AVAILABLE: return []
266
- try:
267
- SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
268
- creds = service_account.Credentials.from_service_account_info(credentials_dict, scopes=SCOPES)
269
- service = build('drive', 'v3', credentials=creds)
270
- return service.files().list(pageSize=15, fields="files(id, name, mimeType)").execute().get('files', [])
271
- except Exception as e:
272
- return []
273
 
274
  def download_drive_file(self, file_id, mime_type, credentials_dict):
275
- if not GOOGLE_AVAILABLE: return b""
276
- try:
277
- SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
278
- creds = service_account.Credentials.from_service_account_info(credentials_dict, scopes=SCOPES)
279
- service = build('drive', 'v3', credentials=creds)
280
- if "spreadsheet" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='text/csv')
281
- elif "document" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
282
- elif "presentation" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
283
- else: request = service.files().get_media(fileId=file_id)
284
- fh = io.BytesIO()
285
- downloader = MediaIoBaseDownload(fh, request)
286
- done = False
287
- while done is False: status, done = downloader.next_chunk()
288
- return fh.getvalue()
289
- except: return b""
290
-
291
- def get_s3_buckets(self, access_key, secret_key, region):
292
- if not AWS_AVAILABLE: return []
293
- try:
294
- s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region)
295
- response = s3.list_buckets()
296
- return [b['Name'] for b in response.get('Buckets', [])]
297
- except Exception as e:
298
- print(f"S3 Error: {e}")
299
- return []
300
-
301
- def get_s3_files(self, access_key, secret_key, region, bucket_name):
302
- if not AWS_AVAILABLE: return []
303
- try:
304
- s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region)
305
- response = s3.list_objects_v2(Bucket=bucket_name)
306
- return [obj['Key'] for obj in response.get('Contents', [])]
307
- except Exception as e:
308
- return []
309
-
310
- def download_s3_file(self, access_key, secret_key, region, bucket_name, file_key):
311
- if not AWS_AVAILABLE: return b""
312
- try:
313
- s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region)
314
- obj = s3.get_object(Bucket=bucket_name, Key=file_key)
315
- return obj['Body'].read()
316
- except Exception as e:
317
- return b""
318
-
319
- def get_azure_containers(self, conn_str):
320
- if not AZURE_AVAILABLE: return []
321
- try:
322
- blob_service_client = BlobServiceClient.from_connection_string(conn_str)
323
- containers = blob_service_client.list_containers()
324
- return [c['name'] for c in containers]
325
- except Exception as e:
326
- print(f"Azure Error: {e}")
327
- return []
328
-
329
- def get_azure_blobs(self, conn_str, container_name):
330
- if not AZURE_AVAILABLE: return []
331
- try:
332
- blob_service_client = BlobServiceClient.from_connection_string(conn_str)
333
- container_client = blob_service_client.get_container_client(container_name)
334
- blobs = container_client.list_blobs()
335
- return [b['name'] for b in blobs]
336
- except Exception as e:
337
- return []
338
-
339
- def download_azure_blob(self, conn_str, container_name, blob_name):
340
- if not AZURE_AVAILABLE: return b""
341
- try:
342
- blob_service_client = BlobServiceClient.from_connection_string(conn_str)
343
- blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
344
- return blob_client.download_blob().readall()
345
- except Exception as e:
346
- return b""
347
-
348
- # --- GCP BUCKET CONNECTORS (NEW) ---
349
- def get_gcs_buckets(self, credentials_dict):
350
- """Lists all GCS buckets for the given service account credentials."""
351
- if not GCS_AVAILABLE: return []
352
- try:
353
- # Create credentials object
354
- credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict)
355
- # Create storage client
356
- storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))
357
-
358
- buckets = storage_client.list_buckets()
359
- return [bucket.name for bucket in buckets]
360
- except Exception as e:
361
- print(f"GCP Bucket Error: {e}")
362
- return []
363
-
364
- def get_gcs_files(self, credentials_dict, bucket_name):
365
- """Lists files (blobs) in a specific GCS bucket."""
366
- if not GCS_AVAILABLE: return []
367
- try:
368
- credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict)
369
- storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))
370
-
371
- blobs = storage_client.list_blobs(bucket_name)
372
- return [blob.name for blob in blobs]
373
- except Exception as e:
374
- print(f"GCP List Error: {e}")
375
- return []
376
-
377
- def download_gcs_file(self, credentials_dict, bucket_name, blob_name):
378
- """Downloads a blob from GCS to memory."""
379
- if not GCS_AVAILABLE: return b""
380
- try:
381
- credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict)
382
- storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))
383
-
384
- bucket = storage_client.bucket(bucket_name)
385
- blob = bucket.blob(blob_name)
386
- return blob.download_as_bytes()
387
- except Exception as e:
388
- print(f"GCP Download Error: {e}")
389
- return b""
390
-
391
- # --- FILE READERS ---
392
- def get_json_data(self, file_obj) -> pd.DataFrame:
393
- data = json.load(file_obj)
394
- flat = []
395
- def recursive(d, path):
396
- if isinstance(d, dict):
397
- for k, v in d.items(): recursive(v, f"{path}.{k}" if path else k)
398
- elif isinstance(d, list):
399
- for i, v in enumerate(d): recursive(v, f"{path}[{i}]")
400
- else: flat.append({"Path": path, "Value": str(d)})
401
- recursive(data, "")
402
- return pd.DataFrame(flat)
403
 
404
- def get_parquet_data(self, file_bytes) -> pd.DataFrame:
405
- if not PARQUET_AVAILABLE: return pd.DataFrame()
406
- try:
407
- return pd.read_parquet(io.BytesIO(file_bytes))
408
- except: return pd.DataFrame()
409
-
410
- def get_pdf_total_pages(self, file_bytes) -> int:
411
- try:
412
- doc = fitz.open(stream=file_bytes, filetype="pdf")
413
- return len(doc)
414
- except: return 0
415
 
416
- def get_pdf_page_text(self, file_bytes, page_num):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  try:
418
- doc = fitz.open(stream=file_bytes, filetype="pdf")
419
- return doc[page_num].get_text("text")
420
- except: return ""
 
 
 
 
 
 
 
 
1
  import re
2
  import json
3
  import pandas as pd
4
  import fitz # PyMuPDF
5
  import nltk
6
  import io
7
+ import os
8
+ import pickle
9
+ import base64
10
  from typing import Dict, List, Any
11
  from sqlalchemy import create_engine
12
  from urllib.parse import quote_plus
13
+ from bs4 import BeautifulSoup
14
+
15
+ # --- IMPORT CLASSIFIERS ---
16
+ from classifier_manager.spacy_model import PiiSpacyAnalyzer
17
+ from classifier_manager.presidio_model import PiiPresidioAnalyzer
18
+ from classifier_manager.gliner_model import PiiGlinerAnalyzer
19
+ from classifier_manager.inspector import ModelInspector
20
+
21
+ # --- IMPORT FILE HANDLERS ---
22
+ from file_handlers.ocr_engine import OcrEngine
23
+ from file_handlers.avro_handler import AvroHandler
24
+ from file_handlers.parquet_handler import ParquetHandler
25
+ from file_handlers.json_handler import JsonHandler
26
+ from file_handlers.pdf_handler import PdfHandler
27
+
28
+ # --- IMPORT CONNECTORS ---
29
+ from connectors.postgres_handler import PostgresHandler
30
+ from connectors.mysql_handler import MysqlHandler
31
+ from connectors.gmail_handler import GmailHandler
32
+ from connectors.drive_handler import DriveHandler
33
+ from connectors.aws_s3_handler import S3Handler
34
+ from connectors.azure_handler import AzureBlobHandler
35
+ from connectors.gcp_storage_handler import GcpStorageHandler
36
+ from connectors.slack_handler import SlackHandler # <--- NEW
37
+ from connectors.confluence_handler import ConfluenceHandler # <--- NEW
38
 
39
  # --- DEPENDENCY CHECKS ---
40
# Feature flag consumed by the Drive/Gmail connectors: True only when the
# Google API client library is importable.
try:
    from googleapiclient.discovery import build
    GOOGLE_AVAILABLE = True
except ImportError:
    GOOGLE_AVAILABLE = False
    print("Google Libraries not installed.")
 
46
# Feature flag for the MongoDB connector.
try:
    import pymongo
    MONGO_AVAILABLE = True
except ImportError:  # narrowed from bare `except:`, which would also hide real errors
    MONGO_AVAILABLE = False
 
 
 
 
 
 
 
 
 
 
50
# Feature flags for the cloud-storage connectors. Each probe is narrowed to
# ImportError — the original bare `except:` would also swallow unrelated
# failures (even KeyboardInterrupt) during import.
try:
    import boto3
    AWS_AVAILABLE = True
except ImportError:
    AWS_AVAILABLE = False

try:
    from azure.storage.blob import BlobServiceClient
    AZURE_AVAILABLE = True
except ImportError:
    AZURE_AVAILABLE = False

try:
    from google.cloud import storage
    GCS_AVAILABLE = True
except ImportError:
    GCS_AVAILABLE = False
 
 
62
 
63
+ # NLTK Setup
64
  try:
65
  nltk.data.find('tokenizers/punkt')
66
  except LookupError:
 
73
  class RegexClassifier:
74
  def __init__(self):
75
  self.colors = {
76
+ "EMAIL": "#8ef", "FIRST_NAME": "#af9", "LAST_NAME": "#af9",
77
+ "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea",
78
+ "LOCATION": "#dcf", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"
 
79
  }
80
 
81
  self.patterns: Dict[str, str] = {
 
87
  "PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b",
88
  }
89
 
90
+ # 1. Classifiers
91
  self.spacy_analyzer = PiiSpacyAnalyzer()
92
  self.presidio_analyzer = PiiPresidioAnalyzer()
93
+ self.gliner_analyzer = PiiGlinerAnalyzer()
94
  self.inspector = ModelInspector()
95
+
96
+ # 2. File Handlers
97
+ self.ocr_engine = OcrEngine()
98
+ self.avro_handler = AvroHandler()
99
+ self.parquet_handler = ParquetHandler()
100
+ self.json_handler = JsonHandler()
101
+ self.pdf_handler = PdfHandler(self.ocr_engine)
102
+
103
+ # 3. Connectors
104
+ self.pg_handler = PostgresHandler()
105
+ self.mysql_handler = MysqlHandler()
106
+ self.gmail_handler = GmailHandler()
107
+ self.drive_handler = DriveHandler()
108
+ self.s3_handler = S3Handler()
109
+ self.azure_handler = AzureBlobHandler()
110
+ self.gcp_handler = GcpStorageHandler()
111
+ self.slack_handler = SlackHandler() # <--- Init
112
+ self.confluence_handler = ConfluenceHandler() # <--- Init
113
 
114
  def list_patterns(self): return self.patterns
115
  def add_pattern(self, n, r): self.patterns[n.upper()] = r
116
  def remove_pattern(self, n): self.patterns.pop(n.upper(), None)
117
 
118
+ # --- CORE ANALYSIS ---
119
  def scan_with_regex(self, text: str) -> List[dict]:
120
  matches = []
121
  for label, regex in self.patterns.items():
122
+ for m in re.finditer(regex, text):
123
+ matches.append({"label": label, "text": m.group(), "start": m.start(), "end": m.end(), "source": "Regex"})
124
  return matches
125
 
126
  def scan_with_nltk(self, text: str) -> List[dict]:
127
  detections = []
128
  try:
129
+ for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
 
 
 
130
  if hasattr(chunk, 'label') and chunk.label() in ['PERSON', 'GPE']:
131
  val = " ".join(c[0] for c in chunk)
132
+ start = text.find(val)
133
+ if start != -1:
134
+ detections.append({
135
+ "label": "LOCATION" if chunk.label() == 'GPE' else "FIRST_NAME",
136
+ "text": val, "start": start, "end": start+len(val), "source": "NLTK"
137
+ })
138
  except: pass
139
  return detections
140
 
141
  def analyze_text_hybrid(self, text: str) -> List[dict]:
142
+ if not text: return []
143
  all_matches = []
144
  all_matches.extend(self.scan_with_regex(text))
145
  all_matches.extend(self.scan_with_nltk(text))
146
  all_matches.extend(self.spacy_analyzer.scan(text))
147
  all_matches.extend(self.presidio_analyzer.scan(text))
148
+ all_matches.extend(self.gliner_analyzer.scan(text))
149
 
150
  all_matches.sort(key=lambda x: x['start'])
151
+ unique = []
 
152
  if not all_matches: return []
153
  curr = all_matches[0]
154
+ for next_m in all_matches[1:]:
155
+ if next_m['start'] < curr['end']:
156
+ if len(next_m['text']) > len(curr['text']):
157
+ curr = next_m
158
  else:
159
+ unique.append(curr)
160
+ curr = next_m
161
+ unique.append(curr)
162
+ return unique
163
+
164
+ def run_full_inspection(self, text: str):
165
+ return self.inspector.compare_models(
166
+ self.scan_with_regex(text),
167
+ self.scan_with_nltk(text),
168
+ self.spacy_analyzer.scan(text),
169
+ self.presidio_analyzer.scan(text),
170
+ self.gliner_analyzer.scan(text)
171
+ )
172
+
173
+ # --- WRAPPERS FOR UI ---
174
    def get_json_data(self, file_obj) -> pd.DataFrame:
        """Delegate JSON parsing to the JsonHandler; returns a DataFrame."""
        return self.json_handler.read_file(file_obj)
176
+
177
    def get_pdf_page_text(self, file_bytes, page_num):
        """Extract the text of one PDF page via the PdfHandler."""
        return self.pdf_handler.get_page_text(file_bytes, page_num)
179
+
180
    def get_pdf_total_pages(self, file_bytes) -> int:
        """Return the page count of a PDF via the PdfHandler."""
        return self.pdf_handler.get_total_pages(file_bytes)
182
+
183
    def get_labeled_pdf_image(self, file_bytes, page_num):
        """Render a PDF page with detected PII spans highlighted.

        Runs the hybrid analysis on the page text, then hands the matches and
        the color map to the PdfHandler for drawing.
        """
        text = self.get_pdf_page_text(file_bytes, page_num)
        matches = self.analyze_text_hybrid(text)
        return self.pdf_handler.render_labeled_image(file_bytes, page_num, matches, self.colors)
187
+
188
+ def get_avro_data(self, file_bytes) -> pd.DataFrame:
189
+ return self.avro_handler.convert_to_dataframe(file_bytes)
190
+
191
+ def get_parquet_data(self, file_bytes) -> pd.DataFrame:
192
+ return self.parquet_handler.convert_to_dataframe(file_bytes)
193
+
194
+ def get_ocr_text_from_image(self, file_bytes) -> str:
195
+ return self.ocr_engine.extract_text(file_bytes)
196
+
197
+ def get_pii_counts_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
198
+ text = " ".join(df.astype(str).values.flatten())
199
+ matches = self.analyze_text_hybrid(str(text))
200
+ if not matches: return pd.DataFrame(columns=["PII Type", "Count"])
201
+ counts = {}
202
+ for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1
203
+ return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"])
204
+
205
  def get_pii_counts(self, text: str) -> pd.DataFrame:
206
  matches = self.analyze_text_hybrid(str(text))
207
  if not matches: return pd.DataFrame(columns=["PII Type", "Count"])
 
209
  for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1
210
  return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"])
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  def mask_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
213
+ def mask_text(text):
214
+ text = str(text)
 
 
 
 
 
 
 
 
 
 
215
  matches = self.analyze_text_hybrid(text)
216
+ matches.sort(key=lambda x: x['start'], reverse=True)
217
  for m in matches:
218
+ if "***" not in text[m['start']:m['end']]:
219
+ text = text[:m['start']] + "******" + text[m['end']:]
220
+ return text
221
+ return df.map(lambda x: mask_text(x) if isinstance(x, (str, int, float)) else x)
 
 
 
222
 
223
  def scan_dataframe_with_html(self, df: pd.DataFrame) -> pd.DataFrame:
224
+ def highlight(text):
225
  text = str(text)
226
  matches = self.analyze_text_hybrid(text)
227
  matches.sort(key=lambda x: x['start'], reverse=True)
 
228
  for m in matches:
229
  if "<span" in text[m['start']:m['end']]: continue
230
+ color = self.colors.get(m['label'], self.colors["DEFAULT"])
231
+ replacement = f'<span style="background:{color}; padding:2px; border-radius:4px;">{m["text"]}</span>'
232
+ text = text[:m['start']] + replacement + text[m['end']:]
233
  return text
234
+ return df.map(lambda x: highlight(x) if isinstance(x, str) else x)
235
+
236
+ def get_data_schema(self, df):
237
+ return pd.DataFrame({"Column": df.columns, "Type": df.dtypes.astype(str)})
238
+
239
+ # --- CONNECTOR WRAPPERS ---
 
 
 
 
 
 
 
 
 
 
 
 
240
  def get_postgres_data(self, host, port, db, user, pw, table):
241
+ return self.pg_handler.fetch_data(host, port, db, user, pw, table)
 
 
 
242
 
243
  def get_mysql_data(self, host, port, db, user, pw, table):
244
+ return self.mysql_handler.fetch_data(host, port, db, user, pw, table)
 
 
 
245
 
246
+ def get_gmail_data(self, credentials_file, num_emails=10) -> pd.DataFrame:
247
+ return self.gmail_handler.fetch_emails(credentials_file, num_emails)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
  def get_google_drive_files(self, credentials_dict):
250
+ return self.drive_handler.list_files(credentials_dict)
 
 
 
 
 
 
 
251
 
252
  def download_drive_file(self, file_id, mime_type, credentials_dict):
253
+ return self.drive_handler.download_file(file_id, mime_type, credentials_dict)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
+ def get_s3_buckets(self, a, s, r): return self.s3_handler.get_buckets(a, s, r)
256
+ def get_s3_files(self, a, s, r, b): return self.s3_handler.get_files(a, s, r, b)
257
+ def download_s3_file(self, a, s, r, b, k): return self.s3_handler.download_file(a, s, r, b, k)
 
 
 
 
 
 
 
 
258
 
259
+ def get_azure_containers(self, c): return self.azure_handler.get_containers(c)
260
+ def get_azure_blobs(self, c, n): return self.azure_handler.get_blobs(c, n)
261
+ def download_azure_blob(self, c, n, b): return self.azure_handler.download_blob(c, n, b)
262
+
263
+ def get_gcs_buckets(self, c): return self.gcp_handler.get_buckets(c)
264
+ def get_gcs_files(self, c, b): return self.gcp_handler.get_files(c, b)
265
+ def download_gcs_file(self, c, b, n): return self.gcp_handler.download_file(c, b, n)
266
+
267
+ # --- NEW WRAPPERS FOR SLACK & CONFLUENCE ---
268
+ def get_slack_messages(self, token, channel_id):
269
+ return self.slack_handler.fetch_messages(token, channel_id)
270
+
271
+ def get_confluence_page(self, url, username, token, page_id):
272
+ return self.confluence_handler.fetch_page_content(url, username, token, page_id)
273
+
274
+ # --- MONGO (Still here) ---
275
+ def get_mongodb_data(self, host, port, db, user, pw, collection):
276
+ if not MONGO_AVAILABLE: return pd.DataFrame()
277
  try:
278
+ if user and pw: uri = f"mongodb://{quote_plus(user)}:{quote_plus(pw)}@{host}:{port}/"
279
+ else: uri = f"mongodb://{host}:{port}/"
280
+ client = pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000)
281
+ cursor = client[db][collection].find().limit(100)
282
+ data = list(cursor)
283
+ if not data: return pd.DataFrame()
284
+ for d in data: d['_id'] = str(d.get('_id', ''))
285
+ return pd.json_normalize(data)
286
+ except: return pd.DataFrame()
new_spacy β†’ classifier_manager/__init__.py RENAMED
File without changes
classifier_manager/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (206 Bytes). View file
 
classifier_manager/__pycache__/gliner_model.cpython-313.pyc ADDED
Binary file (2.9 kB). View file
 
classifier_manager/__pycache__/inspector.cpython-313.pyc ADDED
Binary file (3.8 kB). View file
 
classifier_manager/__pycache__/presidio_model.cpython-313.pyc ADDED
Binary file (2.97 kB). View file
 
classifier_manager/__pycache__/regex_scanner.cpython-313.pyc ADDED
Binary file (2.51 kB). View file
 
classifier_manager/__pycache__/spacy_model.cpython-313.pyc ADDED
Binary file (2.85 kB). View file
 
classifier_manager/gliner_model.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gliner import GLiNER
2
+
3
class PiiGlinerAnalyzer:
    """Zero-shot PII detector backed by a GLiNER model."""

    def __init__(self, model_name="urchade/gliner_small-v2.1"):
        """Load the GLiNER model; on failure the analyzer stays disabled."""
        self.model = None
        self.available = False

        # Natural-language prompts GLiNER is asked to detect.
        self.labels = [
            "person",
            "email",
            "phone number",
            "credit card",
            "social security number",
            "organization",
            "location",
            "date",
            "ip address",
            "passport number",
            "driver license",
        ]

        try:
            print(f"⏳ Loading GLiNER model: {model_name}...")
            # Downloads to the local cache on the first run.
            self.model = GLiNER.from_pretrained(model_name)
            self.available = True
            print("βœ… GLiNER model loaded successfully.")
        except Exception as e:
            print(f"❌ Error loading GLiNER: {e}")

    def scan(self, text: str) -> list:
        """Scan *text* with GLiNER and normalise hits to the app's schema."""
        if not self.available or not text or not text.strip():
            return []

        try:
            # 0.5 threshold is a reasonable precision/recall balance
            # for the small model.
            entities = self.model.predict_entities(text, self.labels, threshold=0.5)

            # GLiNER emits the lowercase prompt labels; map them onto the
            # app's uppercase keys so the UI and inspector stay consistent.
            label_map = {
                "person": "FIRST_NAME",
                "phone number": "PHONE",
                "social security number": "SSN",
                "organization": "ORG",
                "location": "LOCATION",
                "ip address": "IP_ADDRESS",
                "credit card": "CREDIT_CARD",
                "email": "EMAIL",
                "date": "DATE_TIME",
                "passport number": "PASSPORT",
                "driver license": "DRIVER_LICENSE",
            }

            results = []
            for ent in entities:
                fallback = ent["label"].upper().replace(" ", "_")
                results.append({
                    "label": label_map.get(ent["label"], fallback),
                    "text": ent["text"],
                    "start": ent["start"],
                    "end": ent["end"],
                    "score": ent["score"],
                    "source": "GLiNER",
                })
            return results

        except Exception as e:
            print(f"⚠️ GLiNER Scan Error: {e}")
            return []
inspector.py β†’ classifier_manager/inspector.py RENAMED
@@ -12,16 +12,17 @@ class ModelInspector:
12
  "end": match["end"]
13
  }
14
 
15
- def compare_models(self, regex_matches, nltk_matches, spacy_matches, presidio_matches):
16
  """
17
- Compares 4 lists of matches to find Unique vs Missed PII.
 
18
  """
19
  all_detections = {}
20
 
21
  def add_to_master(matches, model_name):
22
  found_set = set()
23
  for m in matches:
24
- # Use tuple key for uniqueness
25
  key = (m['start'], m['end'], m['text'])
26
  if key not in all_detections:
27
  all_detections[key] = {'text': m['text'], 'label': m['label']}
@@ -32,19 +33,26 @@ class ModelInspector:
32
  regex_set = add_to_master(regex_matches, "Regex")
33
  nltk_set = add_to_master(nltk_matches, "NLTK")
34
  spacy_set = add_to_master(spacy_matches, "SpaCy")
35
- presidio_set = add_to_master(presidio_matches, "Presidio") # <--- Added Presidio
 
36
 
37
- # 2. Calculate "Missed" Data
38
  total_unique_pii = set(all_detections.keys())
39
 
40
  regex_missed = total_unique_pii - regex_set
41
  nltk_missed = total_unique_pii - nltk_set
42
  spacy_missed = total_unique_pii - spacy_set
43
- presidio_missed = total_unique_pii - presidio_set # <--- Added Presidio
 
44
 
45
  def fmt(item_set):
46
  items = [all_detections[k]['text'] for k in item_set]
47
- return ", ".join(items) if items else "None"
 
 
 
 
 
48
 
49
  total_count = len(total_unique_pii) if len(total_unique_pii) > 0 else 1
50
 
@@ -76,7 +84,15 @@ class ModelInspector:
76
  "Missed PII": fmt(presidio_missed),
77
  "Accuracy": len(presidio_set) / total_count,
78
  "Count": len(presidio_set)
 
 
 
 
 
 
 
79
  }
80
  ]
81
 
82
- return pd.DataFrame(stats)
 
 
12
  "end": match["end"]
13
  }
14
 
15
+ def compare_models(self, regex_matches, nltk_matches, spacy_matches, presidio_matches, gliner_matches):
16
  """
17
+ Compares 5 lists of matches to find Unique vs Missed PII.
18
+ Added GLiNER to the comparison logic.
19
  """
20
  all_detections = {}
21
 
22
  def add_to_master(matches, model_name):
23
  found_set = set()
24
  for m in matches:
25
+ # Use tuple key for uniqueness: (start, end, text)
26
  key = (m['start'], m['end'], m['text'])
27
  if key not in all_detections:
28
  all_detections[key] = {'text': m['text'], 'label': m['label']}
 
33
  regex_set = add_to_master(regex_matches, "Regex")
34
  nltk_set = add_to_master(nltk_matches, "NLTK")
35
  spacy_set = add_to_master(spacy_matches, "SpaCy")
36
+ presidio_set = add_to_master(presidio_matches, "Presidio")
37
+ gliner_set = add_to_master(gliner_matches, "GLiNER") # <--- Added GLiNER
38
 
39
+ # 2. Calculate "Missed" Data (Union of all models)
40
  total_unique_pii = set(all_detections.keys())
41
 
42
  regex_missed = total_unique_pii - regex_set
43
  nltk_missed = total_unique_pii - nltk_set
44
  spacy_missed = total_unique_pii - spacy_set
45
+ presidio_missed = total_unique_pii - presidio_set
46
+ gliner_missed = total_unique_pii - gliner_set # <--- Added GLiNER
47
 
48
  def fmt(item_set):
49
  items = [all_detections[k]['text'] for k in item_set]
50
+ # Limiting to first 5 items to prevent UI clutter if list is huge
51
+ display_items = items[:5]
52
+ res = ", ".join(display_items)
53
+ if len(items) > 5:
54
+ res += f", (+{len(items)-5} more)"
55
+ return res if res else "None"
56
 
57
  total_count = len(total_unique_pii) if len(total_unique_pii) > 0 else 1
58
 
 
84
  "Missed PII": fmt(presidio_missed),
85
  "Accuracy": len(presidio_set) / total_count,
86
  "Count": len(presidio_set)
87
+ },
88
+ {
89
+ "Model": "πŸ¦… GLiNER",
90
+ "Detected PII": fmt(gliner_set),
91
+ "Missed PII": fmt(gliner_missed),
92
+ "Accuracy": len(gliner_set) / total_count,
93
+ "Count": len(gliner_set)
94
  }
95
  ]
96
 
97
+ # Return sorted by Accuracy descending so best model is on top
98
+ return pd.DataFrame(stats).sort_values(by="Accuracy", ascending=False)
presidio_model.py β†’ classifier_manager/presidio_model.py RENAMED
File without changes
classifier_manager/regex_scanner.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, List
3
+
4
class RegexScanner:
    """Pattern-based PII detector built on plain regular expressions."""

    def __init__(self):
        # Highlight colours keyed by PII label; DEFAULT is the fallback.
        self.colors = {
            "EMAIL": "#8ef", "FIRST_NAME": "#af9", "LAST_NAME": "#af9",
            "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea",
            "LOCATION": "#dcf", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"
        }

        # Built-in patterns; users may add/remove entries at runtime.
        self.patterns: Dict[str, str] = {
            "EMAIL": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            "PHONE": r"\b(?:\+?1[-. ]?)?\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})\b",
            "SSN": r"\b\d{3}-\d{2}-\d{4}\b",
            "CREDIT_CARD": r"\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b",
            "AADHAAR_IND": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b",
            "PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b",
        }

    def add_pattern(self, name, regex):
        """Register (or replace) a pattern under NAME (stored uppercased)."""
        self.patterns[name.upper()] = regex

    def remove_pattern(self, name):
        """Drop a pattern by name; unknown names are silently ignored."""
        self.patterns.pop(name.upper(), None)

    def scan(self, text: str) -> List[dict]:
        """Run every pattern over *text* and return normalised match dicts.

        Malformed user-supplied patterns are skipped rather than raising.
        """
        found = []
        for label, pattern in self.patterns.items():
            try:
                hits = re.finditer(pattern, text)
            except re.error:
                continue  # invalid user-defined regex
            for hit in hits:
                found.append({
                    "label": label,
                    "text": hit.group(),
                    "start": hit.start(),
                    "end": hit.end(),
                    "source": "Regex",
                })
        return found
Spacy_model.py β†’ classifier_manager/spacy_model.py RENAMED
File without changes
connectors/__init__.py ADDED
File without changes
connectors/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (198 Bytes). View file
 
connectors/__pycache__/aws_s3_handler.cpython-313.pyc ADDED
Binary file (2.55 kB). View file
 
connectors/__pycache__/azure_handler.cpython-313.pyc ADDED
Binary file (2.55 kB). View file
 
connectors/__pycache__/confluence_handler.cpython-313.pyc ADDED
Binary file (2.02 kB). View file
 
connectors/__pycache__/drive_handler.cpython-313.pyc ADDED
Binary file (3.18 kB). View file
 
connectors/__pycache__/gcp_storage_handler.cpython-313.pyc ADDED
Binary file (3.11 kB). View file
 
connectors/__pycache__/gmail_handler.cpython-313.pyc ADDED
Binary file (5.2 kB). View file
 
connectors/__pycache__/mongo_handler.cpython-313.pyc ADDED
Binary file (2.32 kB). View file
 
connectors/__pycache__/mysql_handler.cpython-313.pyc ADDED
Binary file (1.55 kB). View file
 
connectors/__pycache__/postgres_handler.cpython-313.pyc ADDED
Binary file (1.58 kB). View file
 
connectors/__pycache__/slack_handler.cpython-313.pyc ADDED
Binary file (2.6 kB). View file
 
connectors/aws_s3_handler.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import boto3
2
+ import io
3
+
4
class S3Handler:
    """Thin read-only wrapper around the AWS S3 client."""

    def __init__(self):
        print("βœ… AWS S3 Handler loaded.")

    @staticmethod
    def _client(access_key, secret_key, region):
        # Single place that turns raw credentials into a boto3 client.
        return boto3.client(
            's3',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
        )

    def get_buckets(self, access_key, secret_key, region):
        """Return bucket names, or [] on any error."""
        try:
            listing = self._client(access_key, secret_key, region).list_buckets()
            return [bucket['Name'] for bucket in listing.get('Buckets', [])]
        except Exception as e:
            print(f"❌ S3 Error: {e}")
            return []

    def get_files(self, access_key, secret_key, region, bucket_name):
        """Return object keys in *bucket_name*, or [] on any error."""
        try:
            listing = self._client(access_key, secret_key, region).list_objects_v2(
                Bucket=bucket_name
            )
            return [entry['Key'] for entry in listing.get('Contents', [])]
        except Exception:
            return []

    def download_file(self, access_key, secret_key, region, bucket_name, file_key):
        """Return the object's bytes, or b'' on any error."""
        try:
            obj = self._client(access_key, secret_key, region).get_object(
                Bucket=bucket_name, Key=file_key
            )
            return obj['Body'].read()
        except Exception as e:
            print(f"❌ S3 Download Error: {e}")
            return b""
connectors/azure_handler.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from azure.storage.blob import BlobServiceClient
2
+
3
class AzureBlobHandler:
    """Read-only wrapper around Azure Blob Storage."""

    def __init__(self):
        print("βœ… Azure Blob Handler loaded.")

    @staticmethod
    def _service(conn_str):
        # Single place that turns a connection string into a service client.
        return BlobServiceClient.from_connection_string(conn_str)

    def get_containers(self, conn_str):
        """Return container names, or [] on any error."""
        try:
            return [c['name'] for c in self._service(conn_str).list_containers()]
        except Exception as e:
            print(f"❌ Azure Error: {e}")
            return []

    def get_blobs(self, conn_str, container_name):
        """Return blob names inside *container_name*, or [] on any error."""
        try:
            container = self._service(conn_str).get_container_client(container_name)
            return [b['name'] for b in container.list_blobs()]
        except Exception:
            return []

    def download_blob(self, conn_str, container_name, blob_name):
        """Return the blob's bytes, or b'' on any error."""
        try:
            blob = self._service(conn_str).get_blob_client(
                container=container_name, blob=blob_name
            )
            return blob.download_blob().readall()
        except Exception as e:
            print(f"❌ Azure Download Error: {e}")
            return b""
connectors/confluence_handler.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from atlassian import Confluence
3
+ from bs4 import BeautifulSoup
4
+
5
class ConfluenceHandler:
    """Fetches Confluence page bodies as plain text for PII scanning."""

    def __init__(self):
        print("βœ… Confluence Handler loaded.")

    def fetch_page_content(self, url, username, api_token, page_id):
        """Return a one-row DataFrame holding the page title and stripped text.

        Returns an empty DataFrame on any API or auth error.
        """
        try:
            api = Confluence(url=url, username=username, password=api_token, cloud=True)

            page = api.get_page_by_id(page_id, expand='body.storage')
            title = page.get('title', 'Unknown Title')
            raw_html = page.get('body', {}).get('storage', {}).get('value', '')

            # Strip markup so downstream scanners see plain text.
            if raw_html:
                clean_text = BeautifulSoup(raw_html, "html.parser").get_text(separator=' ')
            else:
                clean_text = ""

            return pd.DataFrame([{
                "Source": "Confluence",
                "Sender": username,
                "Subject": title,
                "Content": clean_text,
            }])

        except Exception as e:
            print(f"❌ Confluence Error: {e}")
            return pd.DataFrame()
connectors/drive_handler.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ from googleapiclient.discovery import build
4
+ from googleapiclient.http import MediaIoBaseDownload
5
+ from google.oauth2 import service_account
6
+
7
class DriveHandler:
    """Read-only Google Drive access through a service account."""

    def __init__(self):
        print("βœ… Google Drive Handler loaded.")

    @staticmethod
    def _service(credentials_dict):
        # Build a Drive v3 client from an in-memory service-account dict.
        creds = service_account.Credentials.from_service_account_info(
            credentials_dict,
            scopes=['https://www.googleapis.com/auth/drive.readonly'],
        )
        return build('drive', 'v3', credentials=creds)

    def list_files(self, credentials_dict):
        """Return up to 15 file dicts (id, name, mimeType); [] on error."""
        try:
            listing = self._service(credentials_dict).files().list(
                pageSize=15, fields="files(id, name, mimeType)"
            ).execute()
            return listing.get('files', [])
        except Exception as e:
            print(f"❌ Drive List Error: {e}")
            return []

    def download_file(self, file_id, mime_type, credentials_dict) -> bytes:
        """Download a file's bytes; Google-native docs are exported first."""
        try:
            files_api = self._service(credentials_dict).files()

            # Google-native formats cannot be fetched raw — export them.
            if "spreadsheet" in mime_type:
                request = files_api.export_media(fileId=file_id, mimeType='text/csv')
            elif "document" in mime_type or "presentation" in mime_type:
                request = files_api.export_media(fileId=file_id, mimeType='application/pdf')
            else:
                request = files_api.get_media(fileId=file_id)

            buffer = io.BytesIO()
            downloader = MediaIoBaseDownload(buffer, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()

            return buffer.getvalue()
        except Exception as e:
            print(f"❌ Drive Download Error: {e}")
            return b""
connectors/gcp_storage_handler.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from google.cloud import storage
2
+ from google.oauth2 import service_account
3
+
4
class GcpStorageHandler:
    """Read-only wrapper around Google Cloud Storage."""

    def __init__(self):
        print("βœ… GCP Storage Handler loaded.")

    @staticmethod
    def _client(credentials_dict):
        # Build a storage client from an in-memory service-account dict.
        creds = service_account.Credentials.from_service_account_info(credentials_dict)
        return storage.Client(
            credentials=creds, project=credentials_dict.get('project_id')
        )

    def get_buckets(self, credentials_dict):
        """Return bucket names, or [] on any error."""
        try:
            return [b.name for b in self._client(credentials_dict).list_buckets()]
        except Exception as e:
            print(f"❌ GCP Bucket Error: {e}")
            return []

    def get_files(self, credentials_dict, bucket_name):
        """Return object names inside *bucket_name*, or [] on any error."""
        try:
            blobs = self._client(credentials_dict).list_blobs(bucket_name)
            return [blob.name for blob in blobs]
        except Exception as e:
            print(f"❌ GCP List Error: {e}")
            return []

    def download_file(self, credentials_dict, bucket_name, blob_name):
        """Return the object's bytes, or b'' on any error."""
        try:
            bucket = self._client(credentials_dict).bucket(bucket_name)
            return bucket.blob(blob_name).download_as_bytes()
        except Exception as e:
            print(f"❌ GCP Download Error: {e}")
            return b""
connectors/gmail_handler.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ import pickle
4
+ import pandas as pd
5
+ from bs4 import BeautifulSoup
6
+ from googleapiclient.discovery import build
7
+ from google_auth_oauthlib.flow import InstalledAppFlow
8
+ from google.auth.transport.requests import Request
9
+
10
class GmailHandler:
    """OAuth-based reader for recent Gmail messages."""

    def __init__(self):
        print("βœ… Gmail Handler loaded.")

    def _get_credentials(self, credentials_file, scopes):
        # Reuse a pickled token when present; otherwise run the local OAuth flow.
        creds = None
        token_path = 'token.pickle'
        if os.path.exists(token_path):
            with open(token_path, 'rb') as token:
                creds = pickle.load(token)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                # InstalledAppFlow needs a real file path, so persist the
                # uploaded client secret to disk temporarily.
                with open("temp_client_secret.json", "wb") as f:
                    f.write(credentials_file.getvalue())
                flow = InstalledAppFlow.from_client_secrets_file(
                    'temp_client_secret.json', scopes
                )
                creds = flow.run_local_server(port=0)
            with open(token_path, 'wb') as token:
                pickle.dump(creds, token)

        if os.path.exists("temp_client_secret.json"):
            os.remove("temp_client_secret.json")
        return creds

    @staticmethod
    def _extract_body(payload):
        # Prefer text/plain parts of a multipart message; fall back to the
        # top-level body for simple messages.
        body = ""
        if 'parts' in payload:
            for part in payload['parts']:
                if part['mimeType'] == 'text/plain' and 'data' in part['body']:
                    body += base64.urlsafe_b64decode(part['body']['data']).decode()
        elif 'body' in payload and 'data' in payload['body']:
            body += base64.urlsafe_b64decode(payload['body']['data']).decode()
        return body

    def fetch_emails(self, credentials_file, num_emails=10) -> pd.DataFrame:
        """Authenticate against Gmail and return recent messages as a DataFrame.

        Returns an empty DataFrame on any auth or API failure.
        """
        try:
            scopes = ['https://www.googleapis.com/auth/gmail.readonly']
            creds = self._get_credentials(credentials_file, scopes)

            service = build('gmail', 'v1', credentials=creds)
            listing = service.users().messages().list(
                userId='me', maxResults=num_emails
            ).execute()

            rows = []
            for stub in listing.get('messages', []):
                msg = service.users().messages().get(userId='me', id=stub['id']).execute()
                payload = msg['payload']
                headers = payload.get("headers")

                subject = next(
                    (h['value'] for h in headers if h['name'] == 'Subject'), "No Subject"
                )
                sender = next(
                    (h['value'] for h in headers if h['name'] == 'From'), "Unknown"
                )

                clean_body = BeautifulSoup(
                    self._extract_body(payload), "html.parser"
                ).get_text()
                rows.append({
                    "Source": "Gmail",
                    "Sender": sender,
                    "Subject": subject,
                    "Content": f"Subject: {subject}\n\n{clean_body}",
                })

            return pd.DataFrame(rows)

        except Exception as e:
            print(f"❌ Gmail Error: {e}")
            return pd.DataFrame()
connectors/mongo_handler.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from urllib.parse import quote_plus
3
+
4
class MongoHandler:
    """Best-effort reader for MongoDB collections (pymongo is optional)."""

    def __init__(self):
        # Import lazily so the app still runs when pymongo is missing.
        try:
            import pymongo
            self.pymongo = pymongo
            print("βœ… MongoDB Handler loaded.")
        except ImportError:
            self.pymongo = None
            print("❌ PyMongo not installed.")

    def fetch_data(self, host, port, db, user, pw, collection):
        """Fetch up to 100 documents as a flattened DataFrame.

        Returns an empty DataFrame when pymongo is unavailable, the server
        is unreachable, or the collection is empty.
        """
        if self.pymongo is None:
            return pd.DataFrame()

        try:
            # URL-escape credentials so special characters don't break the URI.
            if user and pw:
                uri = f"mongodb://{quote_plus(user)}:{quote_plus(pw)}@{host}:{port}/"
            else:
                uri = f"mongodb://{host}:{port}/"

            client = self.pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000)
            client.server_info()  # fail fast if the server is unreachable

            docs = list(client[db][collection].find().limit(100))
            if not docs:
                return pd.DataFrame()

            # ObjectId is not DataFrame-friendly; stringify it for display.
            for doc in docs:
                if '_id' in doc:
                    doc['_id'] = str(doc['_id'])

            return pd.json_normalize(docs)

        except Exception as e:
            print(f"❌ Mongo Error: {e}")
            return pd.DataFrame()
connectors/mysql_handler.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sqlalchemy import create_engine
3
+ from urllib.parse import quote_plus
4
+
5
class MysqlHandler:
    """Reads the first 100 rows of a MySQL table via SQLAlchemy/PyMySQL."""

    def __init__(self):
        print("βœ… MySQL Handler loaded.")

    def fetch_data(self, host, port, db, user, pw, table):
        """Return up to 100 rows of *table*; empty DataFrame on any error."""
        try:
            # URL-escape the password so special characters survive the URI.
            url = f"mysql+pymysql://{user}:{quote_plus(pw)}@{host}:{port}/{db}"
            engine = create_engine(url)
            # NOTE(review): *table* is interpolated into the SQL unescaped —
            # only safe for trusted table names.
            return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine)
        except Exception as e:
            print(f"❌ MySQL Error: {e}")
            return pd.DataFrame()
connectors/postgres_handler.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sqlalchemy import create_engine
3
+ from urllib.parse import quote_plus
4
+
5
class PostgresHandler:
    """Reads the first 100 rows of a PostgreSQL table via SQLAlchemy."""

    def __init__(self):
        print("βœ… PostgreSQL Handler loaded.")

    def fetch_data(self, host, port, db, user, pw, table):
        """Return up to 100 rows of *table*; empty DataFrame on any error."""
        try:
            # URL-escape the password so special characters survive the URI.
            url = f"postgresql://{user}:{quote_plus(pw)}@{host}:{port}/{db}"
            engine = create_engine(url)
            # NOTE(review): *table* is interpolated into the SQL unescaped —
            # only safe for trusted table names.
            return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine)
        except Exception as e:
            print(f"❌ PostgreSQL Error: {e}")
            return pd.DataFrame()
connectors/slack_handler.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from slack_sdk import WebClient
3
+ from slack_sdk.errors import SlackApiError
4
+ import datetime
5
+
6
class SlackHandler:
    """Fetches recent channel messages via the Slack Web API."""

    def __init__(self):
        print("βœ… Slack Handler loaded.")

    def fetch_messages(self, token, channel_id, num_messages=20):
        """Return up to *num_messages* plain messages as a DataFrame.

        System messages (those carrying a ``subtype``) are skipped; an
        empty DataFrame is returned when nothing remains or on any error.
        """
        try:
            response = WebClient(token=token).conversations_history(
                channel=channel_id, limit=num_messages
            )

            rows = []
            if response['ok']:
                for msg in response['messages']:
                    if 'subtype' in msg:
                        continue  # join/leave notices etc. — not user text
                    stamp = datetime.datetime.fromtimestamp(
                        float(msg.get('ts', 0))
                    ).strftime('%Y-%m-%d %H:%M:%S')
                    rows.append({
                        "Source": "Slack",
                        "Sender": msg.get('user', 'Unknown'),
                        "Subject": f"Message in {channel_id} at {stamp}",
                        "Content": msg.get('text', ''),
                    })

            if not rows:
                print("⚠️ No messages found in channel.")
                return pd.DataFrame()
            return pd.DataFrame(rows)

        except SlackApiError as e:
            print(f"❌ Slack API Error: {e.response['error']}")
            return pd.DataFrame()
        except Exception as e:
            print(f"❌ Slack Handler Error: {e}")
            return pd.DataFrame()
file_handlers/__init__.py ADDED
File without changes
file_handlers/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (201 Bytes). View file
 
file_handlers/__pycache__/avro_handler.cpython-313.pyc ADDED
Binary file (2.01 kB). View file
 
file_handlers/__pycache__/json_handler.cpython-313.pyc ADDED
Binary file (2.37 kB). View file
 
file_handlers/__pycache__/ocr_engine.cpython-313.pyc ADDED
Binary file (1.92 kB). View file
 
file_handlers/__pycache__/parquet_handler.cpython-313.pyc ADDED
Binary file (1.76 kB). View file
 
file_handlers/__pycache__/pdf_handler.cpython-313.pyc ADDED
Binary file (3.8 kB). View file
 
file_handlers/avro_handler.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # avro_handler.py
2
+ import io
3
+ import pandas as pd
4
+
5
class AvroHandler:
    """Decodes Avro container bytes into a pandas DataFrame (needs fastavro)."""

    def __init__(self):
        # fastavro is optional; without it every read returns an empty frame.
        self.available = False
        try:
            import fastavro
            self.fastavro = fastavro
            self.available = True
            print("βœ… Avro Handler loaded.")
        except ImportError:
            print("❌ fastavro not found. Please run: pip install fastavro")

    def convert_to_dataframe(self, file_bytes: bytes) -> pd.DataFrame:
        """Decode *file_bytes* as Avro; empty DataFrame when disabled or on error."""
        if not self.available:
            return pd.DataFrame()

        try:
            # Wrap the raw bytes so fastavro can stream records from them.
            records = list(self.fastavro.reader(io.BytesIO(file_bytes)))
            if not records:
                return pd.DataFrame()
            return pd.DataFrame(records)
        except Exception as e:
            print(f"⚠️ Avro Read Error: {e}")
            return pd.DataFrame()
file_handlers/json_handler.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ import io
4
+
5
class JsonHandler:
    """Reads JSON uploads and flattens nested structures into a DataFrame."""

    def __init__(self):
        print("βœ… JSON Handler loaded.")

    def read_file(self, file_obj) -> pd.DataFrame:
        """Parse *file_obj* (plain file or Streamlit upload) into a flat DataFrame.

        Nested dicts become underscore-joined columns; lists are
        stringified under a ``*_list`` column. Returns an empty frame
        on any parse error.
        """
        try:
            if hasattr(file_obj, "getvalue"):
                # Streamlit's UploadedFile exposes raw bytes.
                data = json.loads(file_obj.getvalue().decode('utf-8'))
            else:
                data = json.load(file_obj)

            def _flatten(node, prefix=''):
                # Depth-first flatten: dict keys join with '_';
                # lists collapse to their repr under '<prefix>list'.
                if isinstance(node, dict):
                    flat = {}
                    for key, value in node.items():
                        flat.update(_flatten(value, prefix + key + '_'))
                    return flat
                if isinstance(node, list):
                    return {f"{prefix}list": str(node)}
                return {prefix[:-1]: node}

            if isinstance(data, list):
                return pd.DataFrame([_flatten(item) for item in data])
            return pd.DataFrame([_flatten(data)])

        except Exception as e:
            print(f"❌ JSON Read Error: {e}")
            return pd.DataFrame()