diff --git a/__pycache__/avro_handler.cpython-313.pyc b/__pycache__/avro_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf67fe6b7d7c8a7ae43d146c339c49a07888cc31 Binary files /dev/null and b/__pycache__/avro_handler.cpython-313.pyc differ diff --git a/__pycache__/backend.cpython-313.pyc b/__pycache__/backend.cpython-313.pyc index 1e3e92b97fec90689c864b4b11352e1b7dcaefb5..7ffacef767f3f24dc9fa3eacb3ac465420c2ffb0 100644 Binary files a/__pycache__/backend.cpython-313.pyc and b/__pycache__/backend.cpython-313.pyc differ diff --git a/__pycache__/gliner_model.cpython-313.pyc b/__pycache__/gliner_model.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..351c2954d944308738f8d0d6bff180b09f729713 Binary files /dev/null and b/__pycache__/gliner_model.cpython-313.pyc differ diff --git a/__pycache__/inspector.cpython-313.pyc b/__pycache__/inspector.cpython-313.pyc index 46d94e65422cc69f0bbb2fa76bf065329106e604..934d2e918175606a797f8baff75f8eca0753aab0 100644 Binary files a/__pycache__/inspector.cpython-313.pyc and b/__pycache__/inspector.cpython-313.pyc differ diff --git a/__pycache__/ocr_engine.cpython-313.pyc b/__pycache__/ocr_engine.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57bcf8029888d5bf51aa9cece684a315e195ac51 Binary files /dev/null and b/__pycache__/ocr_engine.cpython-313.pyc differ diff --git a/api.py b/api.py new file mode 100644 index 0000000000000000000000000000000000000000..dd3acf24523d37e3d0963d1695cca0f4728cc2e2 --- /dev/null +++ b/api.py @@ -0,0 +1,143 @@ +# api.py +from fastapi import FastAPI, UploadFile, File, Form, HTTPException +from pydantic import BaseModel +from typing import Optional, List +import pandas as pd +import io +import json + +# Import your existing backend orchestrator +from core.backend import RegexClassifier + +app = FastAPI(title="Segmento Sense API") + +# Initialize the Brain +backend = RegexClassifier() + +# --- Pydantic Models for Requests --- +class DbConnection(BaseModel): + type: str # postgres, mysql, mongo + host: str + port: str + user: str + password: str + database: str + collection: Optional[str] = None + +class CloudConnection(BaseModel): + service: str # aws, azure, gcp + key_1: str # access_key or conn_string + key_2: Optional[str] = None # secret_key + region: Optional[str] = None + bucket: str + file_name: str + +class AppConnection(BaseModel): + service: str # gmail, slack, confluence + token_or_path: str # token or credentials.json content + target: str # channel_id, page_id, or num_emails + +# --- ENDPOINTS --- + +@app.get("/") +def health_check(): + return {"status": "Segmento Sense is running"} + +@app.post("/scan/file") +async def scan_file(file: UploadFile = File(...)): + """ + Handles PDF, CSV, JSON, Parquet, Avro, Image uploads. + """ + file_bytes = await file.read() + filename = file.filename.lower() + + df = pd.DataFrame() + raw_text = "" + + # 1. Route to correct handler in backend.py + if filename.endswith(".pdf"): + # For demo, scan page 0 + raw_text = backend.get_pdf_page_text(file_bytes, 0) + # Scan text + inspection = backend.run_full_inspection(raw_text) + matches = backend.analyze_text_hybrid(raw_text) + return { + "type": "unstructured", + "content": raw_text, + "matches": matches, + "stats": inspection.to_dict(orient="records") + } + + elif filename.endswith((".png", ".jpg", ".jpeg")): + raw_text = backend.get_ocr_text_from_image(file_bytes) + inspection = backend.run_full_inspection(raw_text) + matches = backend.analyze_text_hybrid(raw_text) + return { + "type": "unstructured", + "content": raw_text, + "matches": matches, + "stats": inspection.to_dict(orient="records") + } + + else: + # Structured Data + if filename.endswith(".csv"): + df = pd.read_csv(io.BytesIO(file_bytes)) + elif filename.endswith(".json"): + df = backend.get_json_data(io.BytesIO(file_bytes)) + elif filename.endswith(".parquet"): + df = backend.get_parquet_data(file_bytes) + elif filename.endswith(".avro"): + df = backend.get_avro_data(file_bytes) + + # Get PII Counts + pii_counts = backend.get_pii_counts_dataframe(df) + masked_preview = backend.mask_dataframe(df.head(20)) + + return { + "type": "structured", + "pii_counts": pii_counts.to_dict(orient="records"), + "preview": masked_preview.to_dict(orient="records"), + "schema": backend.get_data_schema(df).to_dict(orient="records") + } + +@app.post("/scan/database") +async def scan_db(conn: DbConnection): + df = pd.DataFrame() + if conn.type == "postgres": + df = backend.get_postgres_data(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection) + elif conn.type == "mysql": + df = backend.get_mysql_data(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection) + elif conn.type == "mongo": + df = backend.get_mongodb_data(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection) + + if df.empty: + raise HTTPException(status_code=404, detail="Connection failed or no data found") + + pii_counts = backend.get_pii_counts_dataframe(df) + return { + "source": conn.type, + "pii_counts": pii_counts.to_dict(orient="records"), + "preview": backend.mask_dataframe(df.head(10)).to_dict(orient="records") + } + +@app.post("/scan/app") +async def scan_app(conn: AppConnection): + df = pd.DataFrame() + + if conn.service == "slack": + df = backend.get_slack_messages(conn.token_or_path, conn.target) + elif conn.service == "confluence": + # Split target "url|user|page_id" if needed or adjust model + # Simplified for demo: assuming backend handles auth + pass + + if df.empty: + raise HTTPException(status_code=400, detail="No data fetched") + + pii_counts = backend.get_pii_counts_dataframe(df) + return { + "source": conn.service, + "pii_counts": pii_counts.to_dict(orient="records"), + "preview": backend.mask_dataframe(df.head(10)).to_dict(orient="records") + } \ No newline at end of file diff --git a/backend.py b/backend.py index 42f508af3ed633e27079b6a82607ed9a653a88e5..19c2776185ca84ceee9de9b3dee63417b577a7bc 100644 --- a/backend.py +++ b/backend.py @@ -1,68 +1,66 @@ -# backend.py import re import json import pandas as pd import fitz # PyMuPDF import nltk import io +import os +import pickle +import base64 from typing import Dict, List, Any from sqlalchemy import create_engine from urllib.parse import quote_plus - -# --- IMPORT MODULES --- -from spacy_model import PiiSpacyAnalyzer -from presidio_model import PiiPresidioAnalyzer -from inspector import ModelInspector +from bs4 import BeautifulSoup + +# --- IMPORT CLASSIFIERS --- +from classifier_manager.spacy_model import PiiSpacyAnalyzer +from classifier_manager.presidio_model import PiiPresidioAnalyzer +from classifier_manager.gliner_model import PiiGlinerAnalyzer +from classifier_manager.inspector import ModelInspector + +# --- IMPORT FILE HANDLERS --- +from file_handlers.ocr_engine import OcrEngine +from file_handlers.avro_handler import AvroHandler +from file_handlers.parquet_handler import ParquetHandler +from file_handlers.json_handler import JsonHandler +from file_handlers.pdf_handler import PdfHandler + +# --- IMPORT CONNECTORS --- +from connectors.postgres_handler import PostgresHandler +from connectors.mysql_handler import MysqlHandler +from connectors.gmail_handler import GmailHandler +from connectors.drive_handler import DriveHandler +from connectors.aws_s3_handler import S3Handler +from connectors.azure_handler import AzureBlobHandler +from connectors.gcp_storage_handler import GcpStorageHandler +from connectors.slack_handler import SlackHandler # <--- NEW +from connectors.confluence_handler import ConfluenceHandler # <--- NEW # --- DEPENDENCY CHECKS --- try: from googleapiclient.discovery import build - from googleapiclient.http import MediaIoBaseDownload - from google.oauth2 import service_account GOOGLE_AVAILABLE = True except ImportError: GOOGLE_AVAILABLE = False - print("Google Drive Libraries not installed.") - + print("Google Libraries not installed.") try: import pymongo MONGO_AVAILABLE = True -except ImportError: - MONGO_AVAILABLE = False - print("PyMongo not installed.") - -try: - import pyarrow - PARQUET_AVAILABLE = True -except ImportError: - PARQUET_AVAILABLE = False - print("PyArrow not installed.") - +except: MONGO_AVAILABLE = False try: import boto3 AWS_AVAILABLE = True -except ImportError: - AWS_AVAILABLE = False - print("Boto3 not installed.") - +except: AWS_AVAILABLE = False try: from azure.storage.blob import BlobServiceClient AZURE_AVAILABLE = True -except ImportError: - AZURE_AVAILABLE = False - print("Azure Storage Blob not installed.") - -# --- GCP STORAGE IMPORT (NEW) --- +except: AZURE_AVAILABLE = False try: from google.cloud import storage - # We reuse google.oauth2.service_account if available, else import it - from google.oauth2 import service_account as gcp_service_account GCS_AVAILABLE = True -except ImportError: - GCS_AVAILABLE = False - print("Google Cloud Storage library not installed.") +except: GCS_AVAILABLE = False -# --- NLTK SETUP --- +# NLTK Setup try: nltk.data.find('tokenizers/punkt') except LookupError: @@ -75,10 +73,9 @@ except LookupError: class RegexClassifier: def __init__(self): self.colors = { - "EMAIL": (136, 238, 255), "FIRST_NAME": (170, 255, 170), "LAST_NAME": (170, 255, 170), - "PHONE": (255, 170, 170), "SSN": (255, 204, 170), "CREDIT_CARD": (255, 238, 170), - "LOCATION": (200, 170, 255), "AADHAAR_IND": (255, 150, 255), "ORG": (255, 255, 150), - "DEFAULT": (224, 224, 224) + "EMAIL": "#8ef", "FIRST_NAME": "#af9", "LAST_NAME": "#af9", + "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea", + "LOCATION": "#dcf", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0" } self.patterns: Dict[str, str] = { @@ -90,69 +87,121 @@ class RegexClassifier: "PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b", } + # 1. Classifiers self.spacy_analyzer = PiiSpacyAnalyzer() self.presidio_analyzer = PiiPresidioAnalyzer() + self.gliner_analyzer = PiiGlinerAnalyzer() self.inspector = ModelInspector() + + # 2. File Handlers + self.ocr_engine = OcrEngine() + self.avro_handler = AvroHandler() + self.parquet_handler = ParquetHandler() + self.json_handler = JsonHandler() + self.pdf_handler = PdfHandler(self.ocr_engine) + + # 3. Connectors + self.pg_handler = PostgresHandler() + self.mysql_handler = MysqlHandler() + self.gmail_handler = GmailHandler() + self.drive_handler = DriveHandler() + self.s3_handler = S3Handler() + self.azure_handler = AzureBlobHandler() + self.gcp_handler = GcpStorageHandler() + self.slack_handler = SlackHandler() # <--- Init + self.confluence_handler = ConfluenceHandler() # <--- Init def list_patterns(self): return self.patterns def add_pattern(self, n, r): self.patterns[n.upper()] = r def remove_pattern(self, n): self.patterns.pop(n.upper(), None) - # --- DETECTION ENGINES --- + # --- CORE ANALYSIS --- def scan_with_regex(self, text: str) -> List[dict]: matches = [] for label, regex in self.patterns.items(): - for match in re.finditer(regex, text): - matches.append({"label": label, "text": match.group(), "start": match.start(), "end": match.end()}) + for m in re.finditer(regex, text): + matches.append({"label": label, "text": m.group(), "start": m.start(), "end": m.end(), "source": "Regex"}) return matches def scan_with_nltk(self, text: str) -> List[dict]: detections = [] try: - tokens = nltk.word_tokenize(text) - chunked = nltk.ne_chunk(nltk.pos_tag(tokens), binary=False) - current_pos = 0 - for chunk in chunked: + for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))): if hasattr(chunk, 'label') and chunk.label() in ['PERSON', 'GPE']: val = " ".join(c[0] for c in chunk) - start_idx = text.find(val, current_pos) - label = "LOCATION" if chunk.label() == 'GPE' else "FIRST_NAME" - if start_idx != -1: - detections.append({"label": label, "text": val, "start": start_idx, "end": start_idx + len(val)}) - current_pos = start_idx + len(val) + start = text.find(val) + if start != -1: + detections.append({ + "label": "LOCATION" if chunk.label() == 'GPE' else "FIRST_NAME", + "text": val, "start": start, "end": start+len(val), "source": "NLTK" + }) except: pass return detections def analyze_text_hybrid(self, text: str) -> List[dict]: + if not text: return [] all_matches = [] all_matches.extend(self.scan_with_regex(text)) all_matches.extend(self.scan_with_nltk(text)) all_matches.extend(self.spacy_analyzer.scan(text)) all_matches.extend(self.presidio_analyzer.scan(text)) + all_matches.extend(self.gliner_analyzer.scan(text)) all_matches.sort(key=lambda x: x['start']) - - unique_matches = [] + unique = [] if not all_matches: return [] curr = all_matches[0] - for next_match in all_matches[1:]: - if next_match['start'] < curr['end']: - if len(next_match['text']) > len(curr['text']): - curr = next_match + for next_m in all_matches[1:]: + if next_m['start'] < curr['end']: + if len(next_m['text']) > len(curr['text']): + curr = next_m else: - unique_matches.append(curr) - curr = next_match - unique_matches.append(curr) - return unique_matches - - def run_full_inspection(self, text: str) -> pd.DataFrame: - r_matches = self.scan_with_regex(text) - n_matches = self.scan_with_nltk(text) - s_matches = self.spacy_analyzer.scan(text) - p_matches = self.presidio_analyzer.scan(text) - return self.inspector.compare_models(r_matches, n_matches, s_matches, p_matches) - - # --- SUMMARY & VISUALS --- + unique.append(curr) + curr = next_m + unique.append(curr) + return unique + + def run_full_inspection(self, text: str): + return self.inspector.compare_models( + self.scan_with_regex(text), + self.scan_with_nltk(text), + self.spacy_analyzer.scan(text), + self.presidio_analyzer.scan(text), + self.gliner_analyzer.scan(text) + ) + + # --- WRAPPERS FOR UI --- + def get_json_data(self, file_obj) -> pd.DataFrame: + return self.json_handler.read_file(file_obj) + + def get_pdf_page_text(self, file_bytes, page_num): + return self.pdf_handler.get_page_text(file_bytes, page_num) + + def get_pdf_total_pages(self, file_bytes) -> int: + return self.pdf_handler.get_total_pages(file_bytes) + + def get_labeled_pdf_image(self, file_bytes, page_num): + text = self.get_pdf_page_text(file_bytes, page_num) + matches = self.analyze_text_hybrid(text) + return self.pdf_handler.render_labeled_image(file_bytes, page_num, matches, self.colors) + + def get_avro_data(self, file_bytes) -> pd.DataFrame: + return self.avro_handler.convert_to_dataframe(file_bytes) + + def get_parquet_data(self, file_bytes) -> pd.DataFrame: + return self.parquet_handler.convert_to_dataframe(file_bytes) + + def get_ocr_text_from_image(self, file_bytes) -> str: + return self.ocr_engine.extract_text(file_bytes) + + def get_pii_counts_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: + text = " ".join(df.astype(str).values.flatten()) + matches = self.analyze_text_hybrid(str(text)) + if not matches: return pd.DataFrame(columns=["PII Type", "Count"]) + counts = {} + for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1 + return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"]) + def get_pii_counts(self, text: str) -> pd.DataFrame: matches = self.analyze_text_hybrid(str(text)) if not matches: return pd.DataFrame(columns=["PII Type", "Count"]) @@ -160,261 +209,78 @@ class RegexClassifier: for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1 return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"]) - def get_pii_counts_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: - full_text = " ".join(df.astype(str).values.flatten()) - return self.get_pii_counts(full_text) - - def mask_pii(self, text: str) -> str: - text = str(text) - matches = self.analyze_text_hybrid(text) - matches.sort(key=lambda x: x['start'], reverse=True) - for m in matches: - masked_val = "******" - if " pd.DataFrame: - def safe_mask(val): - if isinstance(val, (list, dict, tuple, set)): return self.mask_pii(str(val)) - if pd.isna(val): return val - return self.mask_pii(str(val)) - return df.map(safe_mask) - - def get_labeled_pdf_image(self, file_bytes, page_num: int): - try: - doc = fitz.open(stream=file_bytes, filetype="pdf") - if not (0 <= page_num < len(doc)): return None - page = doc[page_num] - text = page.get_text("text") + def mask_text(text): + text = str(text) matches = self.analyze_text_hybrid(text) + matches.sort(key=lambda x: x['start'], reverse=True) for m in matches: - color_norm = tuple(c/255 for c in self.colors.get(m['label'], self.colors["DEFAULT"])) - quads = page.search_for(m['text']) - for quad in quads: - page.draw_rect(quad, color=color_norm, fill=color_norm, fill_opacity=0.4) - page.insert_text(fitz.Point(quad.x0, quad.y0-2), m['label'], fontsize=6, color=(0,0,0)) - return page.get_pixmap(matrix=fitz.Matrix(2, 2)).tobytes("png") - except: return None + if "***" not in text[m['start']:m['end']]: + text = text[:m['start']] + "******" + text[m['end']:] + return text + return df.map(lambda x: mask_text(x) if isinstance(x, (str, int, float)) else x) def scan_dataframe_with_html(self, df: pd.DataFrame) -> pd.DataFrame: - def highlight_html(text): + def highlight(text): text = str(text) matches = self.analyze_text_hybrid(text) matches.sort(key=lambda x: x['start'], reverse=True) - hex_map = {"EMAIL": "#8ef", "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea", "FIRST_NAME": "#af9", "LAST_NAME": "#af9", "LOCATION": "#dcf", "AADHAAR_IND": "#f9f", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"} for m in matches: if "{m["text"]}' - text = text[:m['start']] + tag + text[m['end']:] + color = self.colors.get(m['label'], self.colors["DEFAULT"]) + replacement = f'{m["text"]}' + text = text[:m['start']] + replacement + text[m['end']:] return text - def safe_highlight(val): - if isinstance(val, (list, dict)): return highlight_html(str(val)) - if pd.isna(val): return val - return highlight_html(val) - return df.map(safe_highlight) - - def get_data_schema(self, df: pd.DataFrame) -> pd.DataFrame: - if df.empty: return pd.DataFrame(columns=["Column", "Type", "Sample"]) - schema_info = [] - for col in df.columns: - d_type = str(df[col].dtype) - first_valid_idx = df[col].first_valid_index() - sample_val = str(df[col].loc[first_valid_idx]) if first_valid_idx is not None else "All Null" - if len(sample_val) > 50: sample_val = sample_val[:47] + "..." - schema_info.append({"Column Name": col, "Data Type": d_type, "Sample Value": sample_val}) - return pd.DataFrame(schema_info) - - # --- SQL/MONGO/DRIVE/S3/AZURE CONNECTORS --- + return df.map(lambda x: highlight(x) if isinstance(x, str) else x) + + def get_data_schema(self, df): + return pd.DataFrame({"Column": df.columns, "Type": df.dtypes.astype(str)}) + + # --- CONNECTOR WRAPPERS --- def get_postgres_data(self, host, port, db, user, pw, table): - safe_pw = quote_plus(pw) - conn_str = f"postgresql://{user}:{safe_pw}@{host}:{port}/{db}" - engine = create_engine(conn_str) - return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine) + return self.pg_handler.fetch_data(host, port, db, user, pw, table) def get_mysql_data(self, host, port, db, user, pw, table): - safe_pw = quote_plus(pw) - conn_str = f"mysql+pymysql://{user}:{safe_pw}@{host}:{port}/{db}" - engine = create_engine(conn_str) - return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine) + return self.mysql_handler.fetch_data(host, port, db, user, pw, table) - def get_mongodb_data(self, host, port, db, user, pw, collection): - if not MONGO_AVAILABLE: return pd.DataFrame() - try: - if user and pw: - safe_user = quote_plus(user) - safe_pw = quote_plus(pw) - uri = f"mongodb://{safe_user}:{safe_pw}@{host}:{port}/" - else: - uri = f"mongodb://{host}:{port}/" - client = pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000) - database = client[db] - col = database[collection] - cursor = col.find().limit(100) - data_list = list(cursor) - if not data_list: return pd.DataFrame() - for doc in data_list: - if '_id' in doc: doc['_id'] = str(doc['_id']) - return pd.json_normalize(data_list) - except Exception as e: - print(f"Mongo Error: {e}") - raise e + def get_gmail_data(self, credentials_file, num_emails=10) -> pd.DataFrame: + return self.gmail_handler.fetch_emails(credentials_file, num_emails) def get_google_drive_files(self, credentials_dict): - if not GOOGLE_AVAILABLE: return [] - try: - SCOPES = ['https://www.googleapis.com/auth/drive.readonly'] - creds = service_account.Credentials.from_service_account_info(credentials_dict, scopes=SCOPES) - service = build('drive', 'v3', credentials=creds) - return service.files().list(pageSize=15, fields="files(id, name, mimeType)").execute().get('files', []) - except Exception as e: - return [] + return self.drive_handler.list_files(credentials_dict) def download_drive_file(self, file_id, mime_type, credentials_dict): - if not GOOGLE_AVAILABLE: return b"" - try: - SCOPES = ['https://www.googleapis.com/auth/drive.readonly'] - creds = service_account.Credentials.from_service_account_info(credentials_dict, scopes=SCOPES) - service = build('drive', 'v3', credentials=creds) - if "spreadsheet" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='text/csv') - elif "document" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='application/pdf') - elif "presentation" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='application/pdf') - else: request = service.files().get_media(fileId=file_id) - fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request) - done = False - while done is False: status, done = downloader.next_chunk() - return fh.getvalue() - except: return b"" - - def get_s3_buckets(self, access_key, secret_key, region): - if not AWS_AVAILABLE: return [] - try: - s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region) - response = s3.list_buckets() - return [b['Name'] for b in response.get('Buckets', [])] - except Exception as e: - print(f"S3 Error: {e}") - return [] - - def get_s3_files(self, access_key, secret_key, region, bucket_name): - if not AWS_AVAILABLE: return [] - try: - s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region) - response = s3.list_objects_v2(Bucket=bucket_name) - return [obj['Key'] for obj in response.get('Contents', [])] - except Exception as e: - return [] - - def download_s3_file(self, access_key, secret_key, region, bucket_name, file_key): - if not AWS_AVAILABLE: return b"" - try: - s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region) - obj = s3.get_object(Bucket=bucket_name, Key=file_key) - return obj['Body'].read() - except Exception as e: - return b"" - - def get_azure_containers(self, conn_str): - if not AZURE_AVAILABLE: return [] - try: - blob_service_client = BlobServiceClient.from_connection_string(conn_str) - containers = blob_service_client.list_containers() - return [c['name'] for c in containers] - except Exception as e: - print(f"Azure Error: {e}") - return [] - - def get_azure_blobs(self, conn_str, container_name): - if not AZURE_AVAILABLE: return [] - try: - blob_service_client = BlobServiceClient.from_connection_string(conn_str) - container_client = blob_service_client.get_container_client(container_name) - blobs = container_client.list_blobs() - return [b['name'] for b in blobs] - except Exception as e: - return [] - - def download_azure_blob(self, conn_str, container_name, blob_name): - if not AZURE_AVAILABLE: return b"" - try: - blob_service_client = BlobServiceClient.from_connection_string(conn_str) - blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name) - return blob_client.download_blob().readall() - except Exception as e: - return b"" - - # --- GCP BUCKET CONNECTORS (NEW) --- - def get_gcs_buckets(self, credentials_dict): - """Lists all GCS buckets for the given service account credentials.""" - if not GCS_AVAILABLE: return [] - try: - # Create credentials object - credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict) - # Create storage client - storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id')) - - buckets = storage_client.list_buckets() - return [bucket.name for bucket in buckets] - except Exception as e: - print(f"GCP Bucket Error: {e}") - return [] - - def get_gcs_files(self, credentials_dict, bucket_name): - """Lists files (blobs) in a specific GCS bucket.""" - if not GCS_AVAILABLE: return [] - try: - credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict) - storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id')) - - blobs = storage_client.list_blobs(bucket_name) - return [blob.name for blob in blobs] - except Exception as e: - print(f"GCP List Error: {e}") - return [] - - def download_gcs_file(self, credentials_dict, bucket_name, blob_name): - """Downloads a blob from GCS to memory.""" - if not GCS_AVAILABLE: return b"" - try: - credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict) - storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id')) - - bucket = storage_client.bucket(bucket_name) - blob = bucket.blob(blob_name) - return blob.download_as_bytes() - except Exception as e: - print(f"GCP Download Error: {e}") - return b"" - - # --- FILE READERS --- - def get_json_data(self, file_obj) -> pd.DataFrame: - data = json.load(file_obj) - flat = [] - def recursive(d, path): - if isinstance(d, dict): - for k, v in d.items(): recursive(v, f"{path}.{k}" if path else k) - elif isinstance(d, list): - for i, v in enumerate(d): recursive(v, f"{path}[{i}]") - else: flat.append({"Path": path, "Value": str(d)}) - recursive(data, "") - return pd.DataFrame(flat) + return self.drive_handler.download_file(file_id, mime_type, credentials_dict) - def get_parquet_data(self, file_bytes) -> pd.DataFrame: - if not PARQUET_AVAILABLE: return pd.DataFrame() - try: - return pd.read_parquet(io.BytesIO(file_bytes)) - except: return pd.DataFrame() - - def get_pdf_total_pages(self, file_bytes) -> int: - try: - doc = fitz.open(stream=file_bytes, filetype="pdf") - return len(doc) - except: return 0 + def get_s3_buckets(self, a, s, r): return self.s3_handler.get_buckets(a, s, r) + def get_s3_files(self, a, s, r, b): return self.s3_handler.get_files(a, s, r, b) + def download_s3_file(self, a, s, r, b, k): return self.s3_handler.download_file(a, s, r, b, k) - def get_pdf_page_text(self, file_bytes, page_num): + def get_azure_containers(self, c): return self.azure_handler.get_containers(c) + def get_azure_blobs(self, c, n): return self.azure_handler.get_blobs(c, n) + def download_azure_blob(self, c, n, b): return self.azure_handler.download_blob(c, n, b) + + def get_gcs_buckets(self, c): return self.gcp_handler.get_buckets(c) + def get_gcs_files(self, c, b): return self.gcp_handler.get_files(c, b) + def download_gcs_file(self, c, b, n): return self.gcp_handler.download_file(c, b, n) + + # --- NEW WRAPPERS FOR SLACK & CONFLUENCE --- + def get_slack_messages(self, token, channel_id): + return self.slack_handler.fetch_messages(token, channel_id) + + def get_confluence_page(self, url, username, token, page_id): + return self.confluence_handler.fetch_page_content(url, username, token, page_id) + + # --- MONGO (Still here) --- + def get_mongodb_data(self, host, port, db, user, pw, collection): + if not MONGO_AVAILABLE: return pd.DataFrame() try: - doc = fitz.open(stream=file_bytes, filetype="pdf") - return doc[page_num].get_text("text") - except: return "" \ No newline at end of file + if user and pw: uri = f"mongodb://{quote_plus(user)}:{quote_plus(pw)}@{host}:{port}/" + else: uri = f"mongodb://{host}:{port}/" + client = pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000) + cursor = client[db][collection].find().limit(100) + data = list(cursor) + if not data: return pd.DataFrame() + for d in data: d['_id'] = str(d.get('_id', '')) + return pd.json_normalize(data) + except: return pd.DataFrame() \ No newline at end of file diff --git a/new_spacy b/classifier_manager/__init__.py similarity index 100% rename from new_spacy rename to classifier_manager/__init__.py diff --git a/classifier_manager/__pycache__/__init__.cpython-313.pyc b/classifier_manager/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37c0ce4789cf3dfea9f767888bb820b087228630 Binary files /dev/null and b/classifier_manager/__pycache__/__init__.cpython-313.pyc differ diff --git a/classifier_manager/__pycache__/gliner_model.cpython-313.pyc b/classifier_manager/__pycache__/gliner_model.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a8c5072eaa0156456a7f30abffd4c31a126af10 Binary files /dev/null and b/classifier_manager/__pycache__/gliner_model.cpython-313.pyc differ diff --git a/classifier_manager/__pycache__/inspector.cpython-313.pyc b/classifier_manager/__pycache__/inspector.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..227e751b4b1108087d773a9c3d3cada3614be3db Binary files /dev/null and b/classifier_manager/__pycache__/inspector.cpython-313.pyc differ diff --git a/classifier_manager/__pycache__/presidio_model.cpython-313.pyc b/classifier_manager/__pycache__/presidio_model.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8cd78453b75b13684ac166555c0eb0b23bc7e90 Binary files /dev/null and b/classifier_manager/__pycache__/presidio_model.cpython-313.pyc differ diff --git a/classifier_manager/__pycache__/regex_scanner.cpython-313.pyc b/classifier_manager/__pycache__/regex_scanner.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..798f53f8542f061dc7a4e874ea5c7f6938ea34af Binary files /dev/null and b/classifier_manager/__pycache__/regex_scanner.cpython-313.pyc differ diff --git a/classifier_manager/__pycache__/spacy_model.cpython-313.pyc b/classifier_manager/__pycache__/spacy_model.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..199c45a45d929bca3dcc62ebd48dd9c54324841c Binary files /dev/null and b/classifier_manager/__pycache__/spacy_model.cpython-313.pyc differ diff --git a/classifier_manager/gliner_model.py b/classifier_manager/gliner_model.py new file mode 100644 index 0000000000000000000000000000000000000000..f1b619f9fa73ebe1071b66689174f87670ac0617 --- /dev/null +++ b/classifier_manager/gliner_model.py @@ -0,0 +1,81 @@ +from gliner import GLiNER + +class PiiGlinerAnalyzer: + def __init__(self, model_name="urchade/gliner_small-v2.1"): + """ + Initializes the GLiNER model. + Uses a small, efficient BERT-based model by default. + """ + self.model = None + self.available = False + + # Define the natural language labels you want GLiNER to look for. + # These are used as prompts for the model. + self.labels = [ + "person", + "email", + "phone number", + "credit card", + "social security number", + "organization", + "location", + "date", + "ip address", + "passport number", + "driver license" + ] + + try: + print(f"⏳ Loading GLiNER model: {model_name}...") + # This will download the model to your local cache on the first run + self.model = GLiNER.from_pretrained(model_name) + self.available = True + print("✅ GLiNER model loaded successfully.") + except Exception as e: + print(f"❌ Error loading GLiNER: {e}") + + def scan(self, text: str) -> list: + """ + Scans text using GLiNER and normalizes the output for the Inspector. + """ + if not self.available or not text or not text.strip(): + return [] + + try: + # GLiNER takes text and a list of labels as input + # Threshold 0.5 is a good balance for the small model + entities = self.model.predict_entities(text, self.labels, threshold=0.5) + + detections = [] + + # Map GLiNER's lowercase output labels to your App's standard uppercase keys + # to ensure consistency in the UI and Inspector. + label_map = { + "person": "FIRST_NAME", + "phone number": "PHONE", + "social security number": "SSN", + "organization": "ORG", + "location": "LOCATION", + "ip address": "IP_ADDRESS", + "credit card": "CREDIT_CARD", + "email": "EMAIL", + "date": "DATE_TIME", + "passport number": "PASSPORT", + "driver license": "DRIVER_LICENSE" + } + + for ent in entities: + detections.append({ + "label": label_map.get(ent["label"], ent["label"].upper().replace(" ", "_")), + "text": ent["text"], + "start": ent["start"], + "end": ent["end"], + "score": ent["score"], + "source": "GLiNER" # Helpful metadata + }) + + return detections + + except Exception as e: + print(f"⚠️ GLiNER Scan Error: {e}") + return [] \ No newline at end of file diff --git a/inspector.py b/classifier_manager/inspector.py similarity index 68% rename from inspector.py rename to classifier_manager/inspector.py index 94aedf65de9dc7a987ddf9cb3dde195be5155c14..bb92462bb0aaaf0fae8bc540ed08de35830fb0d5 100644 --- a/inspector.py +++ b/classifier_manager/inspector.py @@ -12,16 +12,17 @@ class ModelInspector: "end": match["end"] } - def compare_models(self, regex_matches, nltk_matches, spacy_matches, presidio_matches): + def compare_models(self, regex_matches, nltk_matches, spacy_matches, presidio_matches, gliner_matches): """ - Compares 4 lists of matches to find Unique vs Missed PII. + Compares 5 lists of matches to find Unique vs Missed PII. + Added GLiNER to the comparison logic. """ all_detections = {} def add_to_master(matches, model_name): found_set = set() for m in matches: - # Use tuple key for uniqueness + # Use tuple key for uniqueness: (start, end, text) key = (m['start'], m['end'], m['text']) if key not in all_detections: all_detections[key] = {'text': m['text'], 'label': m['label']} @@ -32,19 +33,26 @@ class ModelInspector: regex_set = add_to_master(regex_matches, "Regex") nltk_set = add_to_master(nltk_matches, "NLTK") spacy_set = add_to_master(spacy_matches, "SpaCy") - presidio_set = add_to_master(presidio_matches, "Presidio") # <--- Added Presidio + presidio_set = add_to_master(presidio_matches, "Presidio") + gliner_set = add_to_master(gliner_matches, "GLiNER") # <--- Added GLiNER - # 2. Calculate "Missed" Data + # 2. Calculate "Missed" Data (Union of all models) total_unique_pii = set(all_detections.keys()) regex_missed = total_unique_pii - regex_set nltk_missed = total_unique_pii - nltk_set spacy_missed = total_unique_pii - spacy_set - presidio_missed = total_unique_pii - presidio_set # <--- Added Presidio + presidio_missed = total_unique_pii - presidio_set + gliner_missed = total_unique_pii - gliner_set # <--- Added GLiNER def fmt(item_set): items = [all_detections[k]['text'] for k in item_set] - return ", ".join(items) if items else "None" + # Limiting to first 5 items to prevent UI clutter if list is huge + display_items = items[:5] + res = ", ".join(display_items) + if len(items) > 5: + res += f", (+{len(items)-5} more)" + return res if res else "None" total_count = len(total_unique_pii) if len(total_unique_pii) > 0 else 1 @@ -76,7 +84,15 @@ class ModelInspector: "Missed PII": fmt(presidio_missed), "Accuracy": len(presidio_set) / total_count, "Count": len(presidio_set) + }, + { + "Model": "🦅 GLiNER", + "Detected PII": fmt(gliner_set), + "Missed PII": fmt(gliner_missed), + "Accuracy": len(gliner_set) / total_count, + "Count": len(gliner_set) } ] - return pd.DataFrame(stats) \ No newline at end of file + # Return sorted by Accuracy descending so best model is on top + return pd.DataFrame(stats).sort_values(by="Accuracy", ascending=False) \ No newline at end of file diff --git a/presidio_model.py b/classifier_manager/presidio_model.py similarity index 100% rename from presidio_model.py rename to classifier_manager/presidio_model.py diff --git a/classifier_manager/regex_scanner.py b/classifier_manager/regex_scanner.py new file mode 100644 index 0000000000000000000000000000000000000000..29c57051f728c27f28591a28c3859f69fd021c18 --- /dev/null +++ b/classifier_manager/regex_scanner.py @@ -0,0 +1,44 @@ +import re +from typing import Dict, List + +class RegexScanner: + def __init__(self): + self.colors = { + "EMAIL": "#8ef", "FIRST_NAME": "#af9", "LAST_NAME": "#af9", + "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea", + "LOCATION": "#dcf", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0" + } + + self.patterns: Dict[str, str] = { + "EMAIL": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", + "PHONE": r"\b(?:\+?1[-. ]?)?\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})\b", + "SSN": r"\b\d{3}-\d{2}-\d{4}\b", + "CREDIT_CARD": r"\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b", + "AADHAAR_IND": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b", + "PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b", + } + + def add_pattern(self, name, regex): + self.patterns[name.upper()] = regex + + def remove_pattern(self, name): + self.patterns.pop(name.upper(), None) + + def scan(self, text: str) -> List[dict]: + """ + Scans text using defined Regex patterns. + """ + matches = [] + for label, regex in self.patterns.items(): + try: + for m in re.finditer(regex, text): + matches.append({ + "label": label, + "text": m.group(), + "start": m.start(), + "end": m.end(), + "source": "Regex" + }) + except re.error: + continue # Skip invalid user-defined regex + return matches \ No newline at end of file diff --git a/Spacy_model.py b/classifier_manager/spacy_model.py similarity index 100% rename from Spacy_model.py rename to classifier_manager/spacy_model.py diff --git a/connectors/__init__.py b/connectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/connectors/__pycache__/__init__.cpython-313.pyc b/connectors/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b38b9e7c94485879719cf71a01398327eefa3bfe Binary files /dev/null and b/connectors/__pycache__/__init__.cpython-313.pyc differ diff --git a/connectors/__pycache__/aws_s3_handler.cpython-313.pyc b/connectors/__pycache__/aws_s3_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a0ba91b85fd4225efb93b038d63c0c7ab686711 Binary files /dev/null and b/connectors/__pycache__/aws_s3_handler.cpython-313.pyc differ diff --git a/connectors/__pycache__/azure_handler.cpython-313.pyc b/connectors/__pycache__/azure_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46b29c428a5a7a887659a308d8812500634222bb Binary files /dev/null and b/connectors/__pycache__/azure_handler.cpython-313.pyc differ diff --git a/connectors/__pycache__/confluence_handler.cpython-313.pyc b/connectors/__pycache__/confluence_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8664b74053e6e241cf74b56fb78906a7e26a7b66 Binary files /dev/null and b/connectors/__pycache__/confluence_handler.cpython-313.pyc differ diff --git a/connectors/__pycache__/drive_handler.cpython-313.pyc b/connectors/__pycache__/drive_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8e1e416719275f8ade5a73ad44037bf9ab0643c Binary files /dev/null and b/connectors/__pycache__/drive_handler.cpython-313.pyc differ diff --git a/connectors/__pycache__/gcp_storage_handler.cpython-313.pyc b/connectors/__pycache__/gcp_storage_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71cbe4098902a2813f48eedf0331abc2a97d67e7 Binary files /dev/null and b/connectors/__pycache__/gcp_storage_handler.cpython-313.pyc differ diff --git a/connectors/__pycache__/gmail_handler.cpython-313.pyc b/connectors/__pycache__/gmail_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bd9a866bb052a5868def12211796844d47e8e57 Binary files /dev/null and b/connectors/__pycache__/gmail_handler.cpython-313.pyc differ diff --git a/connectors/__pycache__/mongo_handler.cpython-313.pyc b/connectors/__pycache__/mongo_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a33fc41fbb0cfd3e3d280a7b1281f8542fbf37b Binary files /dev/null and b/connectors/__pycache__/mongo_handler.cpython-313.pyc differ diff --git a/connectors/__pycache__/mysql_handler.cpython-313.pyc b/connectors/__pycache__/mysql_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64ab13b7cea1fd487bca207d3dfadf64795987de Binary files /dev/null and b/connectors/__pycache__/mysql_handler.cpython-313.pyc differ diff --git a/connectors/__pycache__/postgres_handler.cpython-313.pyc b/connectors/__pycache__/postgres_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..782d07d168ae050abbb0ecca5bb7d2b41946ae11 Binary files /dev/null and b/connectors/__pycache__/postgres_handler.cpython-313.pyc differ diff --git a/connectors/__pycache__/slack_handler.cpython-313.pyc b/connectors/__pycache__/slack_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72728236c1cd9918b49338adc2c876b5b53a1c5d Binary files /dev/null and b/connectors/__pycache__/slack_handler.cpython-313.pyc differ diff --git a/connectors/aws_s3_handler.py b/connectors/aws_s3_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..4134a4461f31aad00bf88fb54c512713a10338d6 --- /dev/null +++ b/connectors/aws_s3_handler.py @@ -0,0 +1,32 @@ +import boto3 +import io + +class S3Handler: + def __init__(self): + print("✅ AWS S3 Handler loaded.") + + def get_buckets(self, access_key, secret_key, region): + try: + s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region) + response = s3.list_buckets() + return [b['Name'] for b in response.get('Buckets', [])] + except Exception as e: + print(f"❌ S3 Error: {e}") + return [] + + def get_files(self, access_key, secret_key, region, bucket_name): + try: + s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region) + response = s3.list_objects_v2(Bucket=bucket_name) + return [obj['Key'] for obj in response.get('Contents', [])] + except Exception as e: + return [] + + def download_file(self, access_key, secret_key, region, bucket_name, file_key): + try: + s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region) + obj = s3.get_object(Bucket=bucket_name, Key=file_key) + return obj['Body'].read() + except Exception as e: + print(f"❌ S3 Download Error: {e}") + return b"" \ No newline at end of file diff --git a/connectors/azure_handler.py b/connectors/azure_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..f164be2651a3101e2eba9ffc0dc4309000189ca0 --- /dev/null +++ b/connectors/azure_handler.py @@ -0,0 +1,32 @@ +from azure.storage.blob import BlobServiceClient + +class AzureBlobHandler: + def __init__(self): + print("✅ Azure Blob Handler loaded.") + + def get_containers(self, conn_str): + try: + blob_service_client = BlobServiceClient.from_connection_string(conn_str) + containers = blob_service_client.list_containers() + return [c['name'] for c in containers] + except Exception as e: + print(f"❌ Azure Error: {e}") + return [] + + def get_blobs(self, conn_str, container_name): + try: + blob_service_client = BlobServiceClient.from_connection_string(conn_str) + container_client = blob_service_client.get_container_client(container_name) + blobs = container_client.list_blobs() + return [b['name'] for b in blobs] + except Exception as e: + return [] + + def download_blob(self, conn_str, container_name, blob_name): + try: + blob_service_client = BlobServiceClient.from_connection_string(conn_str) + blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name) + return blob_client.download_blob().readall() + except Exception as e: + print(f"❌ Azure Download Error: {e}") + return b"" \ No newline at end of file diff --git a/connectors/confluence_handler.py b/connectors/confluence_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..8411668b19e88b54f65794104f01c70346a241c6 --- /dev/null +++ b/connectors/confluence_handler.py @@ -0,0 +1,44 @@ +import pandas as pd +from atlassian import Confluence +from bs4 import BeautifulSoup + +class ConfluenceHandler: + def __init__(self): + print("✅ Confluence Handler loaded.") + + def fetch_page_content(self, url, username, api_token, page_id): + """ + Fetches the body content of a specific Confluence page. + """ + try: + # Initialize Confluence API + confluence = Confluence( + url=url, + username=username, + password=api_token, + cloud=True + ) + + # Get Page Content + page = confluence.get_page_by_id(page_id, expand='body.storage') + title = page.get('title', 'Unknown Title') + + # Extract HTML body + raw_html = page.get('body', {}).get('storage', {}).get('value', '') + + # Clean HTML tags to get raw text for PII scanning + if raw_html: + clean_text = BeautifulSoup(raw_html, "html.parser").get_text(separator=' ') + else: + clean_text = "" + + return pd.DataFrame([{ + "Source": "Confluence", + "Sender": username, + "Subject": title, + "Content": clean_text + }]) + + except Exception as e: + print(f"❌ Confluence Error: {e}") + return pd.DataFrame() \ No newline at end of file diff --git a/connectors/drive_handler.py b/connectors/drive_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..881b31539d5cc52258cf9ed84aa0843ae6769296 --- /dev/null +++ b/connectors/drive_handler.py @@ -0,0 +1,52 @@ +import io +import json +from googleapiclient.discovery import build +from googleapiclient.http import MediaIoBaseDownload +from google.oauth2 import service_account + +class DriveHandler: + def __init__(self): + print("✅ Google Drive Handler loaded.") + + def list_files(self, credentials_dict): + try: + creds = service_account.Credentials.from_service_account_info( + credentials_dict, scopes=['https://www.googleapis.com/auth/drive.readonly'] + ) + service = build('drive', 'v3', credentials=creds) + results = service.files().list( + pageSize=15, fields="files(id, name, mimeType)" + ).execute() + return results.get('files', []) + except Exception as e: + print(f"❌ Drive List Error: {e}") + return [] + + def download_file(self, file_id, mime_type, credentials_dict) -> bytes: + try: + creds = service_account.Credentials.from_service_account_info( + credentials_dict, scopes=['https://www.googleapis.com/auth/drive.readonly'] + ) + service = build('drive', 'v3', credentials=creds) + + # Export Google Docs to standard formats + if "spreadsheet" in mime_type: + request = service.files().export_media(fileId=file_id, mimeType='text/csv') + elif "document" in mime_type: + request = service.files().export_media(fileId=file_id, mimeType='application/pdf') + elif "presentation" in mime_type: + request = service.files().export_media(fileId=file_id, mimeType='application/pdf') + else: + # Download binary files directly + request = service.files().get_media(fileId=file_id) + + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request) + done = False + while done is False: + status, done = downloader.next_chunk() + + return fh.getvalue() + except Exception as e: + print(f"❌ Drive Download Error: {e}") + return b"" \ No newline at end of file diff --git a/connectors/gcp_storage_handler.py b/connectors/gcp_storage_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..4de70fb00368aa8d00811cc5abea4cece40e056b --- /dev/null +++ b/connectors/gcp_storage_handler.py @@ -0,0 +1,37 @@ +from google.cloud import storage +from google.oauth2 import service_account + +class GcpStorageHandler: + def __init__(self): + print("✅ GCP Storage Handler loaded.") + + def get_buckets(self, credentials_dict): + try: + credentials = service_account.Credentials.from_service_account_info(credentials_dict) + storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id')) + buckets = storage_client.list_buckets() + return [bucket.name for bucket in buckets] + except Exception as e: + print(f"❌ GCP Bucket Error: {e}") + return [] + + def get_files(self, credentials_dict, bucket_name): + try: + credentials = service_account.Credentials.from_service_account_info(credentials_dict) + storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id')) + blobs = storage_client.list_blobs(bucket_name) + return [blob.name for blob in blobs] + except Exception as e: + print(f"❌ GCP List Error: {e}") + return [] + + def download_file(self, credentials_dict, bucket_name, blob_name): + try: + credentials = service_account.Credentials.from_service_account_info(credentials_dict) + storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id')) + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(blob_name) + return blob.download_as_bytes() + except Exception as e: + print(f"❌ GCP Download Error: {e}") + return b"" \ No newline at end of file diff --git a/connectors/gmail_handler.py b/connectors/gmail_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..7b845f3aea90a5e5aed00da58fa5bc25b21b5a2f --- /dev/null +++ b/connectors/gmail_handler.py @@ -0,0 +1,77 @@ +import base64 +import os +import pickle +import pandas as pd +from bs4 import BeautifulSoup +from googleapiclient.discovery import build +from google_auth_oauthlib.flow import InstalledAppFlow +from google.auth.transport.requests import Request + +class GmailHandler: + def __init__(self): + print("✅ Gmail Handler loaded.") + + def fetch_emails(self, credentials_file, num_emails=10) -> pd.DataFrame: + """ + Authenticates and fetches emails from Gmail. + """ + try: + SCOPES = ['https://www.googleapis.com/auth/gmail.readonly'] + creds = None + token_path = 'token.pickle' + + if os.path.exists(token_path): + with open(token_path, 'rb') as token: + creds = pickle.load(token) + + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + else: + # Write temp file because flow requires file path + with open("temp_client_secret.json", "wb") as f: + f.write(credentials_file.getvalue()) + + flow = InstalledAppFlow.from_client_secrets_file('temp_client_secret.json', SCOPES) + creds = flow.run_local_server(port=0) + + with open(token_path, 'wb') as token: + pickle.dump(creds, token) + + if os.path.exists("temp_client_secret.json"): + os.remove("temp_client_secret.json") + + service = build('gmail', 'v1', credentials=creds) + results = service.users().messages().list(userId='me', maxResults=num_emails).execute() + messages = results.get('messages', []) + + email_data = [] + for message in messages: + msg = service.users().messages().get(userId='me', id=message['id']).execute() + payload = msg['payload'] + headers = payload.get("headers") + + subject = next((h['value'] for h in headers if h['name'] == 'Subject'), "No Subject") + sender = next((h['value'] for h in headers if h['name'] == 'From'), "Unknown") + + body = "" + if 'parts' in payload: + for part in payload['parts']: + if part['mimeType'] == 'text/plain' and 'data' in part['body']: + body += base64.urlsafe_b64decode(part['body']['data']).decode() + elif 'body' in payload and 'data' in payload['body']: + body += base64.urlsafe_b64decode(payload['body']['data']).decode() + + clean_body = BeautifulSoup(body, "html.parser").get_text() + email_data.append({ + "Source": "Gmail", + "Sender": sender, + "Subject": subject, + "Content": f"Subject: {subject}\n\n{clean_body}" + }) + + return pd.DataFrame(email_data) + + except Exception as e: + print(f"❌ Gmail Error: {e}") + return pd.DataFrame() \ No newline at end of file diff --git a/connectors/mongo_handler.py b/connectors/mongo_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..f35430acfb15c44ba272024780af5160e3c616af --- /dev/null +++ b/connectors/mongo_handler.py @@ -0,0 +1,45 @@ +import pandas as pd +from urllib.parse import quote_plus + +class MongoHandler: + def __init__(self): + try: + import pymongo + self.pymongo = pymongo + print("✅ MongoDB Handler loaded.") + except ImportError: + self.pymongo = None + print("❌ PyMongo not installed.") + + def fetch_data(self, host, port, db, user, pw, collection): + if not self.pymongo: + return pd.DataFrame() + + try: + if user and pw: + safe_user = quote_plus(user) + safe_pw = quote_plus(pw) + uri = f"mongodb://{safe_user}:{safe_pw}@{host}:{port}/" + else: + uri = f"mongodb://{host}:{port}/" + + client = self.pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000) + # Check connection + client.server_info() + + cursor = client[db][collection].find().limit(100) + data = list(cursor) + + if not data: + return pd.DataFrame() + + # Normalize ObjectIds to strings + for d in data: + if '_id' in d: + d['_id'] = str(d['_id']) + + return pd.json_normalize(data) + + except Exception as e: + print(f"❌ Mongo Error: {e}") + return pd.DataFrame() \ No newline at end of file diff --git a/connectors/mysql_handler.py b/connectors/mysql_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..8dfc0785fbd28a54dcf6192fadeb9a3c79d13679 --- /dev/null +++ b/connectors/mysql_handler.py @@ -0,0 +1,23 @@ +import pandas as pd +from sqlalchemy import create_engine +from urllib.parse import quote_plus + +class MysqlHandler: + def __init__(self): + print("✅ MySQL Handler loaded.") + + def fetch_data(self, host, port, db, user, pw, table): + """ + Connects to MySQL and fetches the first 100 rows of a table. + """ + try: + safe_pw = quote_plus(pw) + # Uses mysql+pymysql driver + conn_str = f"mysql+pymysql://{user}:{safe_pw}@{host}:{port}/{db}" + engine = create_engine(conn_str) + + query = f"SELECT * FROM {table} LIMIT 100" + return pd.read_sql(query, engine) + except Exception as e: + print(f"❌ MySQL Error: {e}") + return pd.DataFrame() \ No newline at end of file diff --git a/connectors/postgres_handler.py b/connectors/postgres_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..69244b4c779e0c3d50c059243ca7a247cc79cba8 --- /dev/null +++ b/connectors/postgres_handler.py @@ -0,0 +1,23 @@ +import pandas as pd +from sqlalchemy import create_engine +from urllib.parse import quote_plus + +class PostgresHandler: + def __init__(self): + print("✅ PostgreSQL Handler loaded.") + + def fetch_data(self, host, port, db, user, pw, table): + """ + Connects to PostgreSQL and fetches the first 100 rows of a table. + """ + try: + safe_pw = quote_plus(pw) + # SQLAlchemy connection string + conn_str = f"postgresql://{user}:{safe_pw}@{host}:{port}/{db}" + engine = create_engine(conn_str) + + query = f"SELECT * FROM {table} LIMIT 100" + return pd.read_sql(query, engine) + except Exception as e: + print(f"❌ PostgreSQL Error: {e}") + return pd.DataFrame() \ No newline at end of file diff --git a/connectors/slack_handler.py b/connectors/slack_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..7e73b982283e32d659dfd4aa7433f935bce21772 --- /dev/null +++ b/connectors/slack_handler.py @@ -0,0 +1,47 @@ +import pandas as pd +from slack_sdk import WebClient +from slack_sdk.errors import SlackApiError +import datetime + +class SlackHandler: + def __init__(self): + print("✅ Slack Handler loaded.") + + def fetch_messages(self, token, channel_id, num_messages=20): + """ + Fetches recent messages from a specific Slack channel. + """ + try: + client = WebClient(token=token) + # Fetch conversation history + response = client.conversations_history(channel=channel_id, limit=num_messages) + + messages = [] + if response['ok']: + for msg in response['messages']: + # Skip subtypes like 'channel_join', only process actual text + if 'subtype' not in msg: + user_id = msg.get('user', 'Unknown') + text = msg.get('text', '') + ts = float(msg.get('ts', 0)) + time_str = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') + + messages.append({ + "Source": "Slack", + "Sender": user_id, + "Subject": f"Message in {channel_id} at {time_str}", + "Content": text + }) + + if not messages: + print("⚠️ No messages found in channel.") + return pd.DataFrame() + + return pd.DataFrame(messages) + + except SlackApiError as e: + print(f"❌ Slack API Error: {e.response['error']}") + return pd.DataFrame() + except Exception as e: + print(f"❌ Slack Handler Error: {e}") + return pd.DataFrame() \ No newline at end of file diff --git a/file_handlers/__init__.py b/file_handlers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/file_handlers/__pycache__/__init__.cpython-313.pyc b/file_handlers/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8442ed10e38a6901629379f42b944c5831236d2 Binary files /dev/null and b/file_handlers/__pycache__/__init__.cpython-313.pyc differ diff --git a/file_handlers/__pycache__/avro_handler.cpython-313.pyc b/file_handlers/__pycache__/avro_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adfe8a82a5af893d5bf425fd0b299614c6f457d9 Binary files /dev/null and b/file_handlers/__pycache__/avro_handler.cpython-313.pyc differ diff --git a/file_handlers/__pycache__/json_handler.cpython-313.pyc b/file_handlers/__pycache__/json_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..337a40ed119538367062e58aa8063fa5e4bbbfa4 Binary files /dev/null and b/file_handlers/__pycache__/json_handler.cpython-313.pyc differ diff --git a/file_handlers/__pycache__/ocr_engine.cpython-313.pyc b/file_handlers/__pycache__/ocr_engine.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c1463f41999a9ecbd01865f527dcd3086c254ff Binary files /dev/null and b/file_handlers/__pycache__/ocr_engine.cpython-313.pyc differ diff --git a/file_handlers/__pycache__/parquet_handler.cpython-313.pyc b/file_handlers/__pycache__/parquet_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea190a2ad66cc6721e6971b5fed7e3a03a93054d Binary files /dev/null and b/file_handlers/__pycache__/parquet_handler.cpython-313.pyc differ diff --git a/file_handlers/__pycache__/pdf_handler.cpython-313.pyc b/file_handlers/__pycache__/pdf_handler.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9607b9dd3b2af360d05ab2a1fc407a1f035245c Binary files /dev/null and b/file_handlers/__pycache__/pdf_handler.cpython-313.pyc differ diff --git a/file_handlers/avro_handler.py b/file_handlers/avro_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..14bc33418b8340c8a7e382f5b231d29cf3e529bc --- /dev/null +++ b/file_handlers/avro_handler.py @@ -0,0 +1,36 @@ +# avro_handler.py +import io +import pandas as pd + +class AvroHandler: + def __init__(self): + self.available = False + try: + import fastavro + self.fastavro = fastavro + self.available = True + print("✅ Avro Handler loaded.") + except ImportError: + print("❌ fastavro not found. Please run: pip install fastavro") + + def convert_to_dataframe(self, file_bytes: bytes) -> pd.DataFrame: + """ + Reads Avro bytes and converts them to a Pandas DataFrame. + """ + if not self.available: + return pd.DataFrame() + + try: + # Create a file-like object from bytes + f = io.BytesIO(file_bytes) + # Use fastavro to read records + reader = self.fastavro.reader(f) + records = [r for r in reader] + + if not records: + return pd.DataFrame() + + return pd.DataFrame(records) + except Exception as e: + print(f"⚠️ Avro Read Error: {e}") + return pd.DataFrame() \ No newline at end of file diff --git a/file_handlers/json_handler.py b/file_handlers/json_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..2736d9166fbfa5190751f6e4c3359f738d3cbd9f --- /dev/null +++ b/file_handlers/json_handler.py @@ -0,0 +1,39 @@ +import json +import pandas as pd +import io + +class JsonHandler: + def __init__(self): + print("✅ JSON Handler loaded.") + + def read_file(self, file_obj) -> pd.DataFrame: + """ + Reads a JSON file object (or Streamlit UploadedFile) and flattens it. + """ + try: + # Handle Streamlit UploadedFile (bytes) vs standard file path + if hasattr(file_obj, "getvalue"): + content = file_obj.getvalue() + data = json.loads(content.decode('utf-8')) + else: + data = json.load(file_obj) + + # Recursive function to flatten nested JSONs + def flatten(x, name=''): + if type(x) is dict: + out = {} + for a in x: out.update(flatten(x[a], name + a + '_')) + return out + elif type(x) is list: + return {f"{name}list": str(x)} + else: return {name[:-1]: x} + + # Normalize to DataFrame + if isinstance(data, list): + return pd.DataFrame([flatten(x) for x in data]) + + return pd.DataFrame([flatten(data)]) + + except Exception as e: + print(f"❌ JSON Read Error: {e}") + return pd.DataFrame() \ No newline at end of file diff --git a/file_handlers/ocr_engine.py b/file_handlers/ocr_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..ee6054b74213ce70f86e95063b7372b28322f17b --- /dev/null +++ b/file_handlers/ocr_engine.py @@ -0,0 +1,35 @@ +# ocr_engine.py +import pytesseract +from PIL import Image +import io + +class OcrEngine: + def __init__(self): + """ + Initializes the OCR Engine using Tesseract. + """ + self.available = False + try: + # Check availability by querying version + pytesseract.get_tesseract_version() + print("✅ Tesseract OCR Engine loaded.") + self.available = True + except Exception as e: + print(f"❌ Tesseract OCR not found: {e}") + print("👉 Install Tesseract system-wide (e.g., 'apt-get install tesseract-ocr') and 'pip install pytesseract'.") + + def extract_text(self, image_bytes: bytes) -> str: + """ + Converts image bytes to text. + """ + if not self.available: + return "" + + try: + image = Image.open(io.BytesIO(image_bytes)) + # Perform OCR + text = pytesseract.image_to_string(image) + return text + except Exception as e: + print(f"⚠️ OCR Extraction Error: {e}") + return "" \ No newline at end of file diff --git a/file_handlers/parquet_handler.py b/file_handlers/parquet_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..009823af582d64da0f2629b444d6e9ba5901d43c --- /dev/null +++ b/file_handlers/parquet_handler.py @@ -0,0 +1,25 @@ +import io +import pandas as pd + +class ParquetHandler: + def __init__(self): + self.available = False + try: + import pyarrow.parquet as pq + self.available = True + print("✅ Parquet Handler loaded.") + except ImportError: + print("❌ PyArrow not found. Please run: pip install pyarrow") + + def convert_to_dataframe(self, file_bytes: bytes) -> pd.DataFrame: + """ + Reads Parquet bytes and converts them to a Pandas DataFrame. + """ + if not self.available: + return pd.DataFrame() + + try: + return pd.read_parquet(io.BytesIO(file_bytes)) + except Exception as e: + print(f"⚠️ Parquet Read Error: {e}") + return pd.DataFrame() \ No newline at end of file diff --git a/file_handlers/pdf_handler.py b/file_handlers/pdf_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..a46e2597bb58e51ced02c4412a4f901ba8a70279 --- /dev/null +++ b/file_handlers/pdf_handler.py @@ -0,0 +1,74 @@ +import fitz # PyMuPDF +import io + +class PdfHandler: + def __init__(self, ocr_engine): + """ + :param ocr_engine: Instance of OcrEngine to handle scanned pages. + """ + self.ocr_engine = ocr_engine + print("✅ PDF Handler loaded.") + + def get_total_pages(self, file_bytes: bytes) -> int: + try: + doc = fitz.open(stream=file_bytes, filetype="pdf") + return len(doc) + except: + return 0 + + def get_page_text(self, file_bytes: bytes, page_num: int) -> str: + """ + Extracts text from a specific page. Falls back to OCR if text is empty. + """ + try: + doc = fitz.open(stream=file_bytes, filetype="pdf") + if not (0 <= page_num < len(doc)): return "" + + page = doc[page_num] + text = page.get_text("text") + + # OCR Fallback for scanned PDFs + if not text.strip() and self.ocr_engine.available: + print(f"⚠️ Page {page_num+1} appears empty/scanned. Running OCR...") + pix = page.get_pixmap() + img_bytes = pix.tobytes("png") + text = self.ocr_engine.extract_text(img_bytes) + + return text + except Exception as e: + print(f"PDF Text Error: {e}") + return "" + + def render_labeled_image(self, file_bytes: bytes, page_num: int, matches: list, color_map: dict) -> bytes: + """ + Draws bounding boxes around detected PII on the PDF page image. + """ + try: + doc = fitz.open(stream=file_bytes, filetype="pdf") + if not (0 <= page_num < len(doc)): return None + + page = doc[page_num] + + # Draw rectangles for each match + for m in matches: + # Get color for this PII type (normalize 0-255 rgb to 0-1 for PyMuPDF) + # color_map values are hex strings or tuples. Assuming the backend passes hex or we default. + # Simplification: Use Red for all boxes for visibility, or logic below: + color_norm = (1, 0, 0) # Default Red + + # Search for the text string on the page + quads = page.search_for(m['text']) + + for q in quads: + # Draw Box + page.draw_rect(q, color=color_norm, width=1.5, fill=color_norm, fill_opacity=0.2) + # Add Label + page.insert_text(fitz.Point(q.x0, q.y0-2), m['label'], fontsize=6, color=(0,0,0)) + + # Render page to image + pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # Zoom=2 for higher quality + return pix.tobytes("png") + + except Exception as e: + print(f"PDF Render Error: {e}") + return None \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ebab1ca6e4d7ea86dd1faadde8ca22ab98eb8c23..a71b6e41a3778f06becb6c80def5dbe0ecc7a0f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ pypdf +pymupdf +beautifulsoup4 pandas streamlit nltk @@ -14,4 +16,10 @@ spacy boto3 presidio-analyzer azure-storage-blob -google-cloud-storage \ No newline at end of file +google-cloud-storage +pytesseract +Pillow +fastavro +gliner +slack_sdk +atlassian-python-api \ No newline at end of file diff --git a/token.pickle b/token.pickle new file mode 100644 index 0000000000000000000000000000000000000000..e26fa18b87413893ddedeb225f4d9301f02a9d84 Binary files /dev/null and b/token.pickle differ diff --git a/ui.py b/ui.py index 26d57dd4048e50e7e1fe86371ba21cc02fbd51b7..d8e168bc0a67cead5572838c7dd2bb49204a51ba 100644 --- a/ui.py +++ b/ui.py @@ -13,61 +13,14 @@ def main(): st.session_state.classifier = RegexClassifier() if 'page_number' not in st.session_state: st.session_state.page_number = 0 - if 'last_accuracy' not in st.session_state: - st.session_state.last_accuracy = {"🛠️ Regex": 0, "🧠 NLTK": 0, "🤖 SpaCy": 0, "🛡️ Presidio": 0} + st.session_state.last_accuracy = {"🛠️ Regex": 0, "🧠 NLTK": 0, "🤖 SpaCy": 0, "🛡️ Presidio": 0, "🦅 GLiNER": 0} classifier = st.session_state.classifier - with st.sidebar: - st.header("1. Source Selection") - main_category = st.selectbox("Select System", ["File System", "Databases", "Cloud Storage"]) - source = None - file_sub_type = None - - if main_category == "File System": - struct_type = st.radio("Data Type", ["Structured Data", "Unstructured Data"]) - if struct_type == "Structured Data": - file_sub_type = st.selectbox("File Format", ["CSV", "JSON", "Parquet"]) - else: - file_sub_type = st.selectbox("File Format", ["PDF"]) - source = "File Upload" - - elif main_category == "Databases": - db_type = st.radio("Database Type", ["Relational (SQL)", "Non-Relational (NoSQL)"]) - if db_type == "Relational (SQL)": - db_icons = {"PostgreSQL": "🐘 PostgreSQL", "MySQL": "🐬 MySQL"} - source = st.selectbox("Select Database", ["PostgreSQL", "MySQL"], format_func=lambda x: db_icons.get(x)) - else: - db_icons = {"MongoDB": "🍃 MongoDB"} - source = st.selectbox("Select Database", ["MongoDB"], format_func=lambda x: db_icons.get(x)) - - elif main_category == "Cloud Storage": - source = st.selectbox("Service", ["Google Drive", "AWS S3", "Azure Blob Storage", "Google Cloud Storage"]) - - st.divider() - st.header("2. Patterns") - patterns = classifier.list_patterns() - ordered_keys = ["EMAIL", "FIRST_NAME", "LAST_NAME", "PHONE", "SSN", "CREDIT_CARD"] - display_patterns = {k: patterns.get(k, "NLTK/SpaCy/Presidio") for k in ordered_keys if k in patterns or k in ["FIRST_NAME", "LAST_NAME"]} - for k, v in patterns.items(): - if k not in display_patterns: display_patterns[k] = v - st.dataframe(pd.DataFrame(list(display_patterns.items()), columns=["Name", "Regex/Method"]), hide_index=True) - - with st.expander("➕ Add Pattern"): - new_name = st.text_input("Name") - new_regex = st.text_input("Regex") - if st.button("Add"): - classifier.add_pattern(new_name, new_regex) - st.rerun() - - with st.expander("🗑️ Remove Pattern"): - pattern_to_remove = st.selectbox("Select Pattern", options=list(patterns.keys())) - if st.button("Remove Selected"): - classifier.remove_pattern(pattern_to_remove) - st.rerun() - + # ================= HELPER FUNCTIONS ================= def render_source_header(title, logo_url): + """Helper to render headers consistently""" col1, col2 = st.columns([0.1, 0.9]) with col1: if logo_url: st.image(logo_url, width=50) @@ -83,15 +36,12 @@ def main(): if results_df.empty: st.info("No PII detected by any model.") return - display_df = results_df[["Model", "Detected PII", "Missed PII"]] - st.table(display_df) + st.table(results_df[["Model", "Detected PII", "Missed PII"]]) col1, col2 = st.columns([2, 1]) with col1: - st.markdown("**Model Accuracy Graph**") fig = px.bar(results_df, x="Accuracy", y="Model", orientation='h', color="Model", text_auto='.2%', range_x=[0,1]) st.plotly_chart(fig, use_container_width=True) with col2: - st.markdown("**Efficiency Gain**") for index, row in results_df.iterrows(): model = row['Model'] current_acc = row['Accuracy'] @@ -104,8 +54,7 @@ def main(): if source_df is not None and not source_df.empty: st.markdown("### 🧬 Data Schema Detected") with st.expander("View Column Types & Samples", expanded=False): - schema_df = classifier.get_data_schema(source_df) - st.dataframe(schema_df, use_container_width=True, hide_index=True) + st.dataframe(classifier.get_data_schema(source_df), use_container_width=True, hide_index=True) st.divider() st.markdown("### 📊 PII Analytics") if count_df.empty: @@ -118,43 +67,100 @@ def main(): with c2: st.dataframe(count_df, hide_index=True, use_container_width=True) - if source == "File Upload": - ext_map = {"PDF": ["pdf"], "CSV": ["csv"], "JSON": ["json"], "Parquet": ["parquet", "pqt"]} - accepted_exts = ext_map.get(file_sub_type, []) - st.subheader(f"📂 {file_sub_type} Analysis") - uploaded_file = st.file_uploader(f"Upload {file_sub_type}", type=accepted_exts) + # ================= SIDEBAR ================= + with st.sidebar: + st.header("1. Source Selection") + main_category = st.selectbox("Select System", ["File System", "Databases", "Cloud Storage", "Enterprise Connectors"]) + source = None + file_sub_type = None + if main_category == "File System": + struct_type = st.radio("Data Type", ["Structured Data", "Unstructured Data"]) + if struct_type == "Structured Data": + file_sub_type = st.selectbox("File Format", ["CSV", "JSON", "Parquet", "Apache Avro"]) + else: + file_sub_type = st.selectbox("File Format", ["PDF", "Image (OCR)"]) + source = "File Upload" + + elif main_category == "Databases": + db_type = st.radio("Database Type", ["Relational (SQL)", "Non-Relational (NoSQL)"]) + if db_type == "Relational (SQL)": + db_icons = {"PostgreSQL": "🐘 PostgreSQL", "MySQL": "🐬 MySQL"} + source = st.selectbox("Select Database", ["PostgreSQL", "MySQL"], format_func=lambda x: db_icons.get(x)) + else: + db_icons = {"MongoDB": "🍃 MongoDB"} + source = st.selectbox("Select Database", ["MongoDB"], format_func=lambda x: db_icons.get(x)) + elif main_category == "Cloud Storage": + source = st.selectbox("Service", ["Google Drive", "AWS S3", "Azure Blob Storage", "Google Cloud Storage"]) + + elif main_category == "Enterprise Connectors": + source = st.selectbox("Platform", ["Gmail", "Slack", "Confluence"]) + + st.divider() + st.header("2. Patterns") + patterns = classifier.list_patterns() + st.dataframe(pd.DataFrame(list(patterns.items()), columns=["Name", "Regex"]), hide_index=True) + + with st.expander("➕ Add Pattern"): + new_name = st.text_input("Name") + new_regex = st.text_input("Regex") + if st.button("Add"): + classifier.add_pattern(new_name, new_regex) + st.rerun() + with st.expander("🗑️ Remove Pattern"): + pattern_to_remove = st.selectbox("Select Pattern", options=list(patterns.keys())) + if st.button("Remove Selected"): + classifier.remove_pattern(pattern_to_remove) + st.rerun() + + # ================= MAIN LOGIC ================= + + # 1. FILE UPLOAD + if source == "File Upload": + uploaded_file = st.file_uploader(f"Upload {file_sub_type}") if uploaded_file: mask_mode = st.checkbox("🔒 Enable PII Masking") + file_bytes = uploaded_file.getvalue() + if file_sub_type == 'PDF': - file_bytes = uploaded_file.getvalue() - current_text = classifier.get_pdf_page_text(file_bytes, st.session_state.page_number) - count_df = classifier.get_pii_counts(current_text) + text = classifier.get_pdf_page_text(file_bytes, st.session_state.page_number) + count_df = classifier.get_pii_counts(text) render_analytics(count_df, None) - render_inspector(current_text) + render_inspector(text) total_pages = classifier.get_pdf_total_pages(file_bytes) - c1, c2, c3 = st.columns([1, 2, 1]) - with c1: - if st.button("⬅️ Prev") and st.session_state.page_number > 0: st.session_state.page_number -= 1 - with c3: - if st.button("Next ➡️") and st.session_state.page_number < total_pages - 1: st.session_state.page_number += 1 + c1, c2, c3 = st.columns([1,2,1]) + if c1.button("Prev"): st.session_state.page_number = max(0, st.session_state.page_number - 1) + if c3.button("Next"): st.session_state.page_number = min(total_pages-1, st.session_state.page_number + 1) st.markdown(f"**Viewing Page {st.session_state.page_number + 1} of {total_pages}**") img = classifier.get_labeled_pdf_image(file_bytes, st.session_state.page_number) if img: st.image(img, use_container_width=True) - else: - if file_sub_type == 'Parquet': df = classifier.get_parquet_data(uploaded_file.getvalue()) - elif file_sub_type == 'CSV': df = pd.read_csv(uploaded_file) - else: df = classifier.get_json_data(uploaded_file) + + elif file_sub_type == 'Image (OCR)': + st.image(uploaded_file, width=400) + with st.spinner("Running OCR..."): + text = classifier.get_ocr_text_from_image(file_bytes) + if text: + df = pd.DataFrame({"Content": [text]}) + render_analytics(classifier.get_pii_counts_dataframe(df), df) + render_inspector(text) + if mask_mode: st.dataframe(classifier.mask_dataframe(df)) + else: st.markdown(classifier.scan_dataframe_with_html(df).to_html(escape=False), unsafe_allow_html=True) + else: st.warning("No text extracted.") + else: # Structured + if file_sub_type == 'Parquet': df = classifier.get_parquet_data(file_bytes) + elif file_sub_type == 'Apache Avro': df = classifier.get_avro_data(file_bytes) + elif file_sub_type == 'CSV': df = pd.read_csv(io.BytesIO(file_bytes)) + else: df = classifier.get_json_data(uploaded_file) + render_analytics(classifier.get_pii_counts_dataframe(df), df) - sample_text = df.head(10).to_string() - render_inspector(sample_text) - + render_inspector(df.head(10).to_string()) if mask_mode: st.dataframe(classifier.mask_dataframe(df).head(50)) else: st.markdown(classifier.scan_dataframe_with_html(df.head(50)).to_html(escape=False), unsafe_allow_html=True) + # 2. DATABASES elif source in ["PostgreSQL", "MySQL", "MongoDB"]: db_logos = { "PostgreSQL": "https://upload.wikimedia.org/wikipedia/commons/2/29/Postgresql_elephant.svg", @@ -184,155 +190,162 @@ def main(): if 'db_data' in st.session_state: df = st.session_state.db_data render_analytics(classifier.get_pii_counts_dataframe(df), df) - sample_text = df.head(10).to_string() - render_inspector(sample_text) + render_inspector(df.head(10).to_string()) st.dataframe(classifier.mask_dataframe(df)) - elif source == "Google Drive": - render_source_header("Google Drive Import", "https://upload.wikimedia.org/wikipedia/commons/d/da/Google_Drive_logo.png") - st.info("Upload your Service Account JSON to connect dynamically.") - creds_file = st.file_uploader("Upload credentials.json", type=['json']) - if creds_file: - creds_dict = json.load(creds_file) - st.session_state.creds_dict = creds_dict - st.success("Credentials Loaded!") - if st.button("📂 List Files"): - st.session_state.drive_files = classifier.get_google_drive_files(creds_dict) - if 'drive_files' in st.session_state: - files = st.session_state.drive_files - if not files: st.warning("No files found.") - else: - file_map = {f['name']: f for f in files} - selected_name = st.selectbox("Select File", list(file_map.keys())) - if st.button("⬇️ Scan File"): - sel_file = file_map[selected_name] - content = classifier.download_drive_file(sel_file['id'], sel_file.get('mimeType', ''), st.session_state.creds_dict) - if not content: st.error("Failed to read.") - else: - st.success(f"Scanning {selected_name}...") - # Reuse scan logic ... - # (omitted for brevity, same as S3) + # 3. CLOUD STORAGE + elif source in ["Google Drive", "AWS S3", "Azure Blob Storage", "Google Cloud Storage"]: + logos = { + "Google Drive": "https://upload.wikimedia.org/wikipedia/commons/d/da/Google_Drive_logo.png", + "AWS S3": "https://upload.wikimedia.org/wikipedia/commons/9/93/Amazon_Web_Services_Logo.svg", + "Azure Blob Storage": "https://upload.wikimedia.org/wikipedia/commons/f/fa/Microsoft_Azure.svg", + "Google Cloud Storage": "https://upload.wikimedia.org/wikipedia/commons/5/51/Google_Cloud_logo.svg" + } + render_source_header(f"{source} Import", logos.get(source, "")) + + # --- GOOGLE DRIVE --- + if source == "Google Drive": + st.info("Upload Service Account JSON") + creds_file = st.file_uploader("credentials.json", type=['json'], key="gdrive") + if creds_file: + creds_dict = json.load(creds_file) + if st.button("📂 List Files"): + st.session_state.drive_files = classifier.get_google_drive_files(creds_dict) + st.session_state.gdrive_creds = creds_dict + + if 'drive_files' in st.session_state: + file_map = {f['name']: f for f in st.session_state.drive_files} + sel = st.selectbox("Select File", list(file_map.keys())) + if st.button("⬇️ Scan"): + content = classifier.download_drive_file(file_map[sel]['id'], file_map[sel]['mimeType'], st.session_state.gdrive_creds) + if isinstance(content, bytes): + try: + txt = content.decode('utf-8') + df = pd.DataFrame({"Content": [txt]}) + render_analytics(classifier.get_pii_counts_dataframe(df), df) + render_inspector(txt) + st.markdown(classifier.scan_dataframe_with_html(df).to_html(escape=False), unsafe_allow_html=True) + except: st.warning("Binary file downloaded.") - elif source == "AWS S3": - render_source_header("AWS S3 Import", "https://upload.wikimedia.org/wikipedia/commons/9/93/Amazon_Web_Services_Logo.svg") - c1, c2, c3 = st.columns(3) - aws_access = c1.text_input("Access Key ID") - aws_secret = c2.text_input("Secret Access Key", type="password") - aws_region = c3.text_input("Region", "us-east-1") + # --- AWS S3 --- + elif source == "AWS S3": + c1, c2, c3 = st.columns(3) + aws_a = c1.text_input("Access Key") + aws_s = c2.text_input("Secret Key", type="password") + aws_r = c3.text_input("Region", "us-east-1") + if st.button("Connect"): + st.session_state.s3_buckets = classifier.get_s3_buckets(aws_a, aws_s, aws_r) + st.session_state.aws_creds = (aws_a, aws_s, aws_r) + + if 's3_buckets' in st.session_state: + sel_b = st.selectbox("Bucket", st.session_state.s3_buckets) + if st.button("List"): + st.session_state.s3_files = classifier.get_s3_files(*st.session_state.aws_creds, sel_b) + if 's3_files' in st.session_state: + sel_f = st.selectbox("File", st.session_state.s3_files) + if st.button("Scan"): + content = classifier.download_s3_file(*st.session_state.aws_creds, sel_b, sel_f) + try: + df = pd.read_csv(io.BytesIO(content)) + render_analytics(classifier.get_pii_counts_dataframe(df), df) + st.dataframe(classifier.mask_dataframe(df)) + except: st.error("Only CSV supported.") - if st.button("🔗 Connect to AWS"): - buckets = classifier.get_s3_buckets(aws_access, aws_secret, aws_region) - if buckets: - st.session_state.aws_creds = (aws_access, aws_secret, aws_region) - st.session_state.s3_buckets = buckets - st.success(f"Connected! Found {len(buckets)} buckets.") - else: st.error("Connection Failed.") + # --- AZURE --- + elif source == "Azure Blob Storage": + conn = st.text_input("Connection String", type="password") + if st.button("Connect"): + st.session_state.az_conts = classifier.get_azure_containers(conn) + st.session_state.az_conn = conn + + if 'az_conts' in st.session_state: + sel_c = st.selectbox("Container", st.session_state.az_conts) + if st.button("List"): + st.session_state.az_blobs = classifier.get_azure_blobs(st.session_state.az_conn, sel_c) + if 'az_blobs' in st.session_state: + sel_b = st.selectbox("Blob", st.session_state.az_blobs) + if st.button("Scan"): + content = classifier.download_azure_blob(st.session_state.az_conn, sel_c, sel_b) + try: + df = pd.read_csv(io.BytesIO(content)) + render_analytics(classifier.get_pii_counts_dataframe(df), df) + st.dataframe(classifier.mask_dataframe(df)) + except: st.error("Only CSV supported.") - if 's3_buckets' in st.session_state: - selected_bucket = st.selectbox("Select Bucket", st.session_state.s3_buckets) - if st.button("📂 List Files"): - creds = st.session_state.aws_creds - st.session_state.s3_files = classifier.get_s3_files(creds[0], creds[1], creds[2], selected_bucket) + # --- GCP --- + elif source == "Google Cloud Storage": + st.info("Upload Service Account JSON") + gcp_f = st.file_uploader("service-account.json", type=['json'], key="gcp") + if gcp_f: + creds = json.load(gcp_f) + if st.button("Connect"): + st.session_state.gcp_buckets = classifier.get_gcs_buckets(creds) + st.session_state.gcp_creds = creds - if 's3_files' in st.session_state and st.session_state.s3_files: - selected_file = st.selectbox("Select File", st.session_state.s3_files) - if st.button("⬇️ Download & Scan"): - creds = st.session_state.aws_creds - file_content = classifier.download_s3_file(creds[0], creds[1], creds[2], selected_bucket, selected_file) - # ... run scan logic ... + if 'gcp_buckets' in st.session_state: + sel_b = st.selectbox("Bucket", st.session_state.gcp_buckets) + if st.button("List"): + st.session_state.gcp_files = classifier.get_gcs_files(st.session_state.gcp_creds, sel_b) + if 'gcp_files' in st.session_state: + sel_f = st.selectbox("File", st.session_state.gcp_files) + if st.button("Scan"): + content = classifier.download_gcs_file(st.session_state.gcp_creds, sel_b, sel_f) + try: + df = pd.read_csv(io.BytesIO(content)) + render_analytics(classifier.get_pii_counts_dataframe(df), df) + st.dataframe(classifier.mask_dataframe(df)) + except: st.error("Only CSV supported.") - elif source == "Azure Blob Storage": - render_source_header("Azure Blob Storage Import", "https://upload.wikimedia.org/wikipedia/commons/f/fa/Microsoft_Azure.svg") + # 4. ENTERPRISE CONNECTORS (NEW) + elif source == "Gmail": + render_source_header("Gmail Scanner", "https://upload.wikimedia.org/wikipedia/commons/7/7e/Gmail_icon_%282020%29.svg") + st.info("Upload your OAuth 2.0 Client Secret JSON (Desktop App).") + uploaded_file = st.file_uploader("Upload client_secret.json", type=['json'], key="gmail_secret") + num_emails = st.slider("Number of recent emails to scan", 5, 50, 10) - st.info("Get your Connection String from Azure Portal -> Storage Account -> Access keys.") - conn_str = st.text_input("Connection String", type="password") + if uploaded_file and st.button("Authenticate & Scan"): + with st.spinner("Authenticating..."): + try: + df = classifier.get_gmail_data(uploaded_file, num_emails) + if not df.empty: + st.session_state.gmail_data = df + st.success("Fetched!") + except Exception as e: st.error(f"Error: {e}") - if st.button("🔗 Connect to Azure"): - containers = classifier.get_azure_containers(conn_str) - if containers: - st.session_state.azure_conn = conn_str - st.session_state.azure_containers = containers - st.success(f"Connected! Found {len(containers)} containers.") - else: - st.error("Connection Failed. Check your string.") + if 'gmail_data' in st.session_state: + df = st.session_state.gmail_data + render_analytics(classifier.get_pii_counts_dataframe(df), df) + render_inspector(df.iloc[0]['Content']) + st.dataframe(classifier.mask_dataframe(df)) - if 'azure_containers' in st.session_state: - selected_container = st.selectbox("Select Container", st.session_state.azure_containers) - if st.button("📂 List Blobs"): - st.session_state.azure_blobs = classifier.get_azure_blobs(st.session_state.azure_conn, selected_container) - if 'azure_blobs' in st.session_state and st.session_state.azure_blobs: - selected_blob = st.selectbox("Select Blob", st.session_state.azure_blobs) - if st.button("⬇️ Download & Scan"): - file_content = classifier.download_azure_blob(st.session_state.azure_conn, selected_container, selected_blob) - # ... run scan logic ... + elif source == "Slack": + render_source_header("Slack Scanner", "https://upload.wikimedia.org/wikipedia/commons/d/d5/Slack_icon_2019.svg") + token = st.text_input("Bot User OAuth Token (xoxb-...)") + channel = st.text_input("Channel ID") + if st.button("Scan Channel"): + with st.spinner("Fetching messages..."): + df = classifier.get_slack_messages(token, channel) + if not df.empty: + render_analytics(classifier.get_pii_counts_dataframe(df), df) + render_inspector(df.iloc[0]['Content']) + st.dataframe(classifier.mask_dataframe(df)) + else: st.error("No messages found or auth failed.") - # --- GCP BUCKETS LOGIC (NEW) --- - elif source == "Google Cloud Storage": - render_source_header("Google Cloud Storage Import", "https://upload.wikimedia.org/wikipedia/commons/5/51/Google_Cloud_logo.svg") + elif source == "Confluence": + render_source_header("Confluence Scanner", "https://upload.wikimedia.org/wikipedia/commons/8/88/Atlassian_Confluence_Logo.svg") + url = st.text_input("Confluence URL (https://your-domain.atlassian.net)") + user = st.text_input("Username (Email)") + token = st.text_input("API Token", type="password") + page_id = st.text_input("Page ID") - st.info("Upload your GCP Service Account JSON key (must have Storage Object Viewer role).") - gcp_creds_file = st.file_uploader("Upload service-account.json", type=['json'], key="gcp_upload") - - if gcp_creds_file: - gcp_creds = json.load(gcp_creds_file) - st.session_state.gcp_creds = gcp_creds - st.success("GCP Credentials Loaded!") - - if st.button("🔗 Connect & List Buckets"): - buckets = classifier.get_gcs_buckets(gcp_creds) - if buckets: - st.session_state.gcs_buckets = buckets - st.success(f"Connected! Found {len(buckets)} buckets.") - else: - st.error("Connection Failed or No Buckets found.") - - if 'gcs_buckets' in st.session_state: - selected_bucket = st.selectbox("Select Bucket", st.session_state.gcs_buckets) - - if st.button("📂 List Files in Bucket"): - st.session_state.gcs_files = classifier.get_gcs_files(st.session_state.gcp_creds, selected_bucket) - - if 'gcs_files' in st.session_state and st.session_state.gcs_files: - selected_file = st.selectbox("Select File", st.session_state.gcs_files) - - if st.button("⬇️ Download & Scan"): - file_content = classifier.download_gcs_file(st.session_state.gcp_creds, selected_bucket, selected_file) - - if not file_content: - st.error("Failed to download file.") - else: - st.success(f"Scanning {selected_file}...") - mask_mode = st.checkbox("🔒 Mask Results", value=False, key="gcs_mask") - - ext = selected_file.split('.')[-1].lower() - - # Reuse Scan Logic (Same as AWS/Azure) - if ext == 'pdf': - text = classifier.get_pdf_page_text(file_content, 0) - render_analytics(classifier.get_pii_counts(text), None) - render_inspector(text) - img = classifier.get_labeled_pdf_image(file_content, 0) - if img: st.image(img, caption="Page 1 Preview") - elif ext == 'csv': - df = pd.read_csv(io.BytesIO(file_content)) - render_analytics(classifier.get_pii_counts_dataframe(df), df) - render_inspector(df.head(10).to_string()) - if mask_mode: st.dataframe(classifier.mask_dataframe(df)) - else: st.markdown(classifier.scan_dataframe_with_html(df).to_html(escape=False), unsafe_allow_html=True) - elif ext == 'json': - df = classifier.get_json_data(io.BytesIO(file_content)) - render_analytics(classifier.get_pii_counts_dataframe(df), df) - render_inspector(df.head(10).to_string()) - if mask_mode: st.dataframe(classifier.mask_dataframe(df)) - else: st.markdown(classifier.scan_dataframe_with_html(df).to_html(escape=False), unsafe_allow_html=True) - elif ext in ['parquet', 'pqt']: - df = classifier.get_parquet_data(file_content) - render_analytics(classifier.get_pii_counts_dataframe(df), df) - render_inspector(df.head(10).to_string()) - if mask_mode: st.dataframe(classifier.mask_dataframe(df)) - else: st.markdown(classifier.scan_dataframe_with_html(df).to_html(escape=False), unsafe_allow_html=True) - elif 'gcs_files' in st.session_state: - st.warning("Bucket is empty.") + if st.button("Scan Page"): + with st.spinner("Fetching page..."): + df = classifier.get_confluence_page(url, user, token, page_id) + if not df.empty: + render_analytics(classifier.get_pii_counts_dataframe(df), df) + render_inspector(df.iloc[0]['Content']) + st.markdown(classifier.scan_dataframe_with_html(df).to_html(escape=False), unsafe_allow_html=True) + else: st.error("Failed to fetch page.") if __name__ == "__main__": main() \ No newline at end of file