SHAFI commited on
Commit Β·
cf4e4d4
1
Parent(s): a8a38df
second commit with refactored code
Browse filesThis view is limited to 50 files because it contains too many changes. Β See raw diff
- __pycache__/avro_handler.cpython-313.pyc +0 -0
- __pycache__/backend.cpython-313.pyc +0 -0
- __pycache__/gliner_model.cpython-313.pyc +0 -0
- __pycache__/inspector.cpython-313.pyc +0 -0
- __pycache__/ocr_engine.cpython-313.pyc +0 -0
- api.py +143 -0
- backend.py +173 -307
- new_spacy β classifier_manager/__init__.py +0 -0
- classifier_manager/__pycache__/__init__.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/gliner_model.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/inspector.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/presidio_model.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/regex_scanner.cpython-313.pyc +0 -0
- classifier_manager/__pycache__/spacy_model.cpython-313.pyc +0 -0
- classifier_manager/gliner_model.py +81 -0
- inspector.py β classifier_manager/inspector.py +24 -8
- presidio_model.py β classifier_manager/presidio_model.py +0 -0
- classifier_manager/regex_scanner.py +44 -0
- Spacy_model.py β classifier_manager/spacy_model.py +0 -0
- connectors/__init__.py +0 -0
- connectors/__pycache__/__init__.cpython-313.pyc +0 -0
- connectors/__pycache__/aws_s3_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/azure_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/confluence_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/drive_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/gcp_storage_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/gmail_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/mongo_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/mysql_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/postgres_handler.cpython-313.pyc +0 -0
- connectors/__pycache__/slack_handler.cpython-313.pyc +0 -0
- connectors/aws_s3_handler.py +32 -0
- connectors/azure_handler.py +32 -0
- connectors/confluence_handler.py +44 -0
- connectors/drive_handler.py +52 -0
- connectors/gcp_storage_handler.py +37 -0
- connectors/gmail_handler.py +77 -0
- connectors/mongo_handler.py +45 -0
- connectors/mysql_handler.py +23 -0
- connectors/postgres_handler.py +23 -0
- connectors/slack_handler.py +47 -0
- file_handlers/__init__.py +0 -0
- file_handlers/__pycache__/__init__.cpython-313.pyc +0 -0
- file_handlers/__pycache__/avro_handler.cpython-313.pyc +0 -0
- file_handlers/__pycache__/json_handler.cpython-313.pyc +0 -0
- file_handlers/__pycache__/ocr_engine.cpython-313.pyc +0 -0
- file_handlers/__pycache__/parquet_handler.cpython-313.pyc +0 -0
- file_handlers/__pycache__/pdf_handler.cpython-313.pyc +0 -0
- file_handlers/avro_handler.py +36 -0
- file_handlers/json_handler.py +39 -0
__pycache__/avro_handler.cpython-313.pyc
ADDED
|
Binary file (2 kB). View file
|
|
|
__pycache__/backend.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/backend.cpython-313.pyc and b/__pycache__/backend.cpython-313.pyc differ
|
|
|
__pycache__/gliner_model.cpython-313.pyc
ADDED
|
Binary file (2.88 kB). View file
|
|
|
__pycache__/inspector.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/inspector.cpython-313.pyc and b/__pycache__/inspector.cpython-313.pyc differ
|
|
|
__pycache__/ocr_engine.cpython-313.pyc
ADDED
|
Binary file (1.9 kB). View file
|
|
|
api.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# api.py
|
| 2 |
+
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from typing import Optional, List
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import io
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
# Import your existing backend orchestrator
|
| 10 |
+
from core.backend import RegexClassifier
|
| 11 |
+
|
| 12 |
+
app = FastAPI(title="Segmento Sense API")
|
| 13 |
+
|
| 14 |
+
# Initialize the Brain
|
| 15 |
+
backend = RegexClassifier()
|
| 16 |
+
|
| 17 |
+
# --- Pydantic Models for Requests ---
|
| 18 |
+
class DbConnection(BaseModel):
|
| 19 |
+
type: str # postgres, mysql, mongo
|
| 20 |
+
host: str
|
| 21 |
+
port: str
|
| 22 |
+
user: str
|
| 23 |
+
password: str
|
| 24 |
+
database: str
|
| 25 |
+
collection: Optional[str] = None
|
| 26 |
+
|
| 27 |
+
class CloudConnection(BaseModel):
|
| 28 |
+
service: str # aws, azure, gcp
|
| 29 |
+
key_1: str # access_key or conn_string
|
| 30 |
+
key_2: Optional[str] = None # secret_key
|
| 31 |
+
region: Optional[str] = None
|
| 32 |
+
bucket: str
|
| 33 |
+
file_name: str
|
| 34 |
+
|
| 35 |
+
class AppConnection(BaseModel):
|
| 36 |
+
service: str # gmail, slack, confluence
|
| 37 |
+
token_or_path: str # token or credentials.json content
|
| 38 |
+
target: str # channel_id, page_id, or num_emails
|
| 39 |
+
|
| 40 |
+
# --- ENDPOINTS ---
|
| 41 |
+
|
| 42 |
+
@app.get("/")
|
| 43 |
+
def health_check():
|
| 44 |
+
return {"status": "Segmento Sense is running"}
|
| 45 |
+
|
| 46 |
+
@app.post("/scan/file")
|
| 47 |
+
async def scan_file(file: UploadFile = File(...)):
|
| 48 |
+
"""
|
| 49 |
+
Handles PDF, CSV, JSON, Parquet, Avro, Image uploads.
|
| 50 |
+
"""
|
| 51 |
+
file_bytes = await file.read()
|
| 52 |
+
filename = file.filename.lower()
|
| 53 |
+
|
| 54 |
+
df = pd.DataFrame()
|
| 55 |
+
raw_text = ""
|
| 56 |
+
|
| 57 |
+
# 1. Route to correct handler in backend.py
|
| 58 |
+
if filename.endswith(".pdf"):
|
| 59 |
+
# For demo, scan page 0
|
| 60 |
+
raw_text = backend.get_pdf_page_text(file_bytes, 0)
|
| 61 |
+
# Scan text
|
| 62 |
+
inspection = backend.run_full_inspection(raw_text)
|
| 63 |
+
matches = backend.analyze_text_hybrid(raw_text)
|
| 64 |
+
return {
|
| 65 |
+
"type": "unstructured",
|
| 66 |
+
"content": raw_text,
|
| 67 |
+
"matches": matches,
|
| 68 |
+
"stats": inspection.to_dict(orient="records")
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
elif filename.endswith((".png", ".jpg", ".jpeg")):
|
| 72 |
+
raw_text = backend.get_ocr_text_from_image(file_bytes)
|
| 73 |
+
inspection = backend.run_full_inspection(raw_text)
|
| 74 |
+
matches = backend.analyze_text_hybrid(raw_text)
|
| 75 |
+
return {
|
| 76 |
+
"type": "unstructured",
|
| 77 |
+
"content": raw_text,
|
| 78 |
+
"matches": matches,
|
| 79 |
+
"stats": inspection.to_dict(orient="records")
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
else:
|
| 83 |
+
# Structured Data
|
| 84 |
+
if filename.endswith(".csv"):
|
| 85 |
+
df = pd.read_csv(io.BytesIO(file_bytes))
|
| 86 |
+
elif filename.endswith(".json"):
|
| 87 |
+
df = backend.get_json_data(io.BytesIO(file_bytes))
|
| 88 |
+
elif filename.endswith(".parquet"):
|
| 89 |
+
df = backend.get_parquet_data(file_bytes)
|
| 90 |
+
elif filename.endswith(".avro"):
|
| 91 |
+
df = backend.get_avro_data(file_bytes)
|
| 92 |
+
|
| 93 |
+
# Get PII Counts
|
| 94 |
+
pii_counts = backend.get_pii_counts_dataframe(df)
|
| 95 |
+
masked_preview = backend.mask_dataframe(df.head(20))
|
| 96 |
+
|
| 97 |
+
return {
|
| 98 |
+
"type": "structured",
|
| 99 |
+
"pii_counts": pii_counts.to_dict(orient="records"),
|
| 100 |
+
"preview": masked_preview.to_dict(orient="records"),
|
| 101 |
+
"schema": backend.get_data_schema(df).to_dict(orient="records")
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
@app.post("/scan/database")
|
| 105 |
+
async def scan_db(conn: DbConnection):
|
| 106 |
+
df = pd.DataFrame()
|
| 107 |
+
if conn.type == "postgres":
|
| 108 |
+
df = backend.get_postgres_data(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection)
|
| 109 |
+
elif conn.type == "mysql":
|
| 110 |
+
df = backend.get_mysql_data(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection)
|
| 111 |
+
elif conn.type == "mongo":
|
| 112 |
+
df = backend.get_mongodb_data(conn.host, conn.port, conn.database, conn.user, conn.password, conn.collection)
|
| 113 |
+
|
| 114 |
+
if df.empty:
|
| 115 |
+
raise HTTPException(status_code=404, detail="Connection failed or no data found")
|
| 116 |
+
|
| 117 |
+
pii_counts = backend.get_pii_counts_dataframe(df)
|
| 118 |
+
return {
|
| 119 |
+
"source": conn.type,
|
| 120 |
+
"pii_counts": pii_counts.to_dict(orient="records"),
|
| 121 |
+
"preview": backend.mask_dataframe(df.head(10)).to_dict(orient="records")
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
@app.post("/scan/app")
|
| 125 |
+
async def scan_app(conn: AppConnection):
|
| 126 |
+
df = pd.DataFrame()
|
| 127 |
+
|
| 128 |
+
if conn.service == "slack":
|
| 129 |
+
df = backend.get_slack_messages(conn.token_or_path, conn.target)
|
| 130 |
+
elif conn.service == "confluence":
|
| 131 |
+
# Split target "url|user|page_id" if needed or adjust model
|
| 132 |
+
# Simplified for demo: assuming backend handles auth
|
| 133 |
+
pass
|
| 134 |
+
|
| 135 |
+
if df.empty:
|
| 136 |
+
raise HTTPException(status_code=400, detail="No data fetched")
|
| 137 |
+
|
| 138 |
+
pii_counts = backend.get_pii_counts_dataframe(df)
|
| 139 |
+
return {
|
| 140 |
+
"source": conn.service,
|
| 141 |
+
"pii_counts": pii_counts.to_dict(orient="records"),
|
| 142 |
+
"preview": backend.mask_dataframe(df.head(10)).to_dict(orient="records")
|
| 143 |
+
}
|
backend.py
CHANGED
|
@@ -1,68 +1,66 @@
|
|
| 1 |
-
# backend.py
|
| 2 |
import re
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
| 5 |
import fitz # PyMuPDF
|
| 6 |
import nltk
|
| 7 |
import io
|
|
|
|
|
|
|
|
|
|
| 8 |
from typing import Dict, List, Any
|
| 9 |
from sqlalchemy import create_engine
|
| 10 |
from urllib.parse import quote_plus
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
from
|
| 15 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# --- DEPENDENCY CHECKS ---
|
| 18 |
try:
|
| 19 |
from googleapiclient.discovery import build
|
| 20 |
-
from googleapiclient.http import MediaIoBaseDownload
|
| 21 |
-
from google.oauth2 import service_account
|
| 22 |
GOOGLE_AVAILABLE = True
|
| 23 |
except ImportError:
|
| 24 |
GOOGLE_AVAILABLE = False
|
| 25 |
-
print("Google
|
| 26 |
-
|
| 27 |
try:
|
| 28 |
import pymongo
|
| 29 |
MONGO_AVAILABLE = True
|
| 30 |
-
except
|
| 31 |
-
MONGO_AVAILABLE = False
|
| 32 |
-
print("PyMongo not installed.")
|
| 33 |
-
|
| 34 |
-
try:
|
| 35 |
-
import pyarrow
|
| 36 |
-
PARQUET_AVAILABLE = True
|
| 37 |
-
except ImportError:
|
| 38 |
-
PARQUET_AVAILABLE = False
|
| 39 |
-
print("PyArrow not installed.")
|
| 40 |
-
|
| 41 |
try:
|
| 42 |
import boto3
|
| 43 |
AWS_AVAILABLE = True
|
| 44 |
-
except
|
| 45 |
-
AWS_AVAILABLE = False
|
| 46 |
-
print("Boto3 not installed.")
|
| 47 |
-
|
| 48 |
try:
|
| 49 |
from azure.storage.blob import BlobServiceClient
|
| 50 |
AZURE_AVAILABLE = True
|
| 51 |
-
except
|
| 52 |
-
AZURE_AVAILABLE = False
|
| 53 |
-
print("Azure Storage Blob not installed.")
|
| 54 |
-
|
| 55 |
-
# --- GCP STORAGE IMPORT (NEW) ---
|
| 56 |
try:
|
| 57 |
from google.cloud import storage
|
| 58 |
-
# We reuse google.oauth2.service_account if available, else import it
|
| 59 |
-
from google.oauth2 import service_account as gcp_service_account
|
| 60 |
GCS_AVAILABLE = True
|
| 61 |
-
except
|
| 62 |
-
GCS_AVAILABLE = False
|
| 63 |
-
print("Google Cloud Storage library not installed.")
|
| 64 |
|
| 65 |
-
#
|
| 66 |
try:
|
| 67 |
nltk.data.find('tokenizers/punkt')
|
| 68 |
except LookupError:
|
|
@@ -75,10 +73,9 @@ except LookupError:
|
|
| 75 |
class RegexClassifier:
|
| 76 |
def __init__(self):
|
| 77 |
self.colors = {
|
| 78 |
-
"EMAIL":
|
| 79 |
-
"PHONE":
|
| 80 |
-
"LOCATION":
|
| 81 |
-
"DEFAULT": (224, 224, 224)
|
| 82 |
}
|
| 83 |
|
| 84 |
self.patterns: Dict[str, str] = {
|
|
@@ -90,69 +87,121 @@ class RegexClassifier:
|
|
| 90 |
"PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b",
|
| 91 |
}
|
| 92 |
|
|
|
|
| 93 |
self.spacy_analyzer = PiiSpacyAnalyzer()
|
| 94 |
self.presidio_analyzer = PiiPresidioAnalyzer()
|
|
|
|
| 95 |
self.inspector = ModelInspector()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
def list_patterns(self): return self.patterns
|
| 98 |
def add_pattern(self, n, r): self.patterns[n.upper()] = r
|
| 99 |
def remove_pattern(self, n): self.patterns.pop(n.upper(), None)
|
| 100 |
|
| 101 |
-
# ---
|
| 102 |
def scan_with_regex(self, text: str) -> List[dict]:
|
| 103 |
matches = []
|
| 104 |
for label, regex in self.patterns.items():
|
| 105 |
-
for
|
| 106 |
-
matches.append({"label": label, "text":
|
| 107 |
return matches
|
| 108 |
|
| 109 |
def scan_with_nltk(self, text: str) -> List[dict]:
|
| 110 |
detections = []
|
| 111 |
try:
|
| 112 |
-
|
| 113 |
-
chunked = nltk.ne_chunk(nltk.pos_tag(tokens), binary=False)
|
| 114 |
-
current_pos = 0
|
| 115 |
-
for chunk in chunked:
|
| 116 |
if hasattr(chunk, 'label') and chunk.label() in ['PERSON', 'GPE']:
|
| 117 |
val = " ".join(c[0] for c in chunk)
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
| 123 |
except: pass
|
| 124 |
return detections
|
| 125 |
|
| 126 |
def analyze_text_hybrid(self, text: str) -> List[dict]:
|
|
|
|
| 127 |
all_matches = []
|
| 128 |
all_matches.extend(self.scan_with_regex(text))
|
| 129 |
all_matches.extend(self.scan_with_nltk(text))
|
| 130 |
all_matches.extend(self.spacy_analyzer.scan(text))
|
| 131 |
all_matches.extend(self.presidio_analyzer.scan(text))
|
|
|
|
| 132 |
|
| 133 |
all_matches.sort(key=lambda x: x['start'])
|
| 134 |
-
|
| 135 |
-
unique_matches = []
|
| 136 |
if not all_matches: return []
|
| 137 |
curr = all_matches[0]
|
| 138 |
-
for
|
| 139 |
-
if
|
| 140 |
-
if len(
|
| 141 |
-
curr =
|
| 142 |
else:
|
| 143 |
-
|
| 144 |
-
curr =
|
| 145 |
-
|
| 146 |
-
return
|
| 147 |
-
|
| 148 |
-
def run_full_inspection(self, text: str)
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
def get_pii_counts(self, text: str) -> pd.DataFrame:
|
| 157 |
matches = self.analyze_text_hybrid(str(text))
|
| 158 |
if not matches: return pd.DataFrame(columns=["PII Type", "Count"])
|
|
@@ -160,261 +209,78 @@ class RegexClassifier:
|
|
| 160 |
for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1
|
| 161 |
return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"])
|
| 162 |
|
| 163 |
-
def get_pii_counts_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 164 |
-
full_text = " ".join(df.astype(str).values.flatten())
|
| 165 |
-
return self.get_pii_counts(full_text)
|
| 166 |
-
|
| 167 |
-
def mask_pii(self, text: str) -> str:
|
| 168 |
-
text = str(text)
|
| 169 |
-
matches = self.analyze_text_hybrid(text)
|
| 170 |
-
matches.sort(key=lambda x: x['start'], reverse=True)
|
| 171 |
-
for m in matches:
|
| 172 |
-
masked_val = "******"
|
| 173 |
-
if "<span" not in text[m['start']:m['end']]:
|
| 174 |
-
text = text[:m['start']] + masked_val + text[m['end']:]
|
| 175 |
-
return text
|
| 176 |
-
|
| 177 |
def mask_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 178 |
-
def
|
| 179 |
-
|
| 180 |
-
if pd.isna(val): return val
|
| 181 |
-
return self.mask_pii(str(val))
|
| 182 |
-
return df.map(safe_mask)
|
| 183 |
-
|
| 184 |
-
def get_labeled_pdf_image(self, file_bytes, page_num: int):
|
| 185 |
-
try:
|
| 186 |
-
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 187 |
-
if not (0 <= page_num < len(doc)): return None
|
| 188 |
-
page = doc[page_num]
|
| 189 |
-
text = page.get_text("text")
|
| 190 |
matches = self.analyze_text_hybrid(text)
|
|
|
|
| 191 |
for m in matches:
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
page.insert_text(fitz.Point(quad.x0, quad.y0-2), m['label'], fontsize=6, color=(0,0,0))
|
| 197 |
-
return page.get_pixmap(matrix=fitz.Matrix(2, 2)).tobytes("png")
|
| 198 |
-
except: return None
|
| 199 |
|
| 200 |
def scan_dataframe_with_html(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 201 |
-
def
|
| 202 |
text = str(text)
|
| 203 |
matches = self.analyze_text_hybrid(text)
|
| 204 |
matches.sort(key=lambda x: x['start'], reverse=True)
|
| 205 |
-
hex_map = {"EMAIL": "#8ef", "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea", "FIRST_NAME": "#af9", "LAST_NAME": "#af9", "LOCATION": "#dcf", "AADHAAR_IND": "#f9f", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"}
|
| 206 |
for m in matches:
|
| 207 |
if "<span" in text[m['start']:m['end']]: continue
|
| 208 |
-
color =
|
| 209 |
-
|
| 210 |
-
text = text[:m['start']] +
|
| 211 |
return text
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
def get_data_schema(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 219 |
-
if df.empty: return pd.DataFrame(columns=["Column", "Type", "Sample"])
|
| 220 |
-
schema_info = []
|
| 221 |
-
for col in df.columns:
|
| 222 |
-
d_type = str(df[col].dtype)
|
| 223 |
-
first_valid_idx = df[col].first_valid_index()
|
| 224 |
-
sample_val = str(df[col].loc[first_valid_idx]) if first_valid_idx is not None else "All Null"
|
| 225 |
-
if len(sample_val) > 50: sample_val = sample_val[:47] + "..."
|
| 226 |
-
schema_info.append({"Column Name": col, "Data Type": d_type, "Sample Value": sample_val})
|
| 227 |
-
return pd.DataFrame(schema_info)
|
| 228 |
-
|
| 229 |
-
# --- SQL/MONGO/DRIVE/S3/AZURE CONNECTORS ---
|
| 230 |
def get_postgres_data(self, host, port, db, user, pw, table):
|
| 231 |
-
|
| 232 |
-
conn_str = f"postgresql://{user}:{safe_pw}@{host}:{port}/{db}"
|
| 233 |
-
engine = create_engine(conn_str)
|
| 234 |
-
return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine)
|
| 235 |
|
| 236 |
def get_mysql_data(self, host, port, db, user, pw, table):
|
| 237 |
-
|
| 238 |
-
conn_str = f"mysql+pymysql://{user}:{safe_pw}@{host}:{port}/{db}"
|
| 239 |
-
engine = create_engine(conn_str)
|
| 240 |
-
return pd.read_sql(f"SELECT * FROM {table} LIMIT 100", engine)
|
| 241 |
|
| 242 |
-
def
|
| 243 |
-
|
| 244 |
-
try:
|
| 245 |
-
if user and pw:
|
| 246 |
-
safe_user = quote_plus(user)
|
| 247 |
-
safe_pw = quote_plus(pw)
|
| 248 |
-
uri = f"mongodb://{safe_user}:{safe_pw}@{host}:{port}/"
|
| 249 |
-
else:
|
| 250 |
-
uri = f"mongodb://{host}:{port}/"
|
| 251 |
-
client = pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000)
|
| 252 |
-
database = client[db]
|
| 253 |
-
col = database[collection]
|
| 254 |
-
cursor = col.find().limit(100)
|
| 255 |
-
data_list = list(cursor)
|
| 256 |
-
if not data_list: return pd.DataFrame()
|
| 257 |
-
for doc in data_list:
|
| 258 |
-
if '_id' in doc: doc['_id'] = str(doc['_id'])
|
| 259 |
-
return pd.json_normalize(data_list)
|
| 260 |
-
except Exception as e:
|
| 261 |
-
print(f"Mongo Error: {e}")
|
| 262 |
-
raise e
|
| 263 |
|
| 264 |
def get_google_drive_files(self, credentials_dict):
|
| 265 |
-
|
| 266 |
-
try:
|
| 267 |
-
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
|
| 268 |
-
creds = service_account.Credentials.from_service_account_info(credentials_dict, scopes=SCOPES)
|
| 269 |
-
service = build('drive', 'v3', credentials=creds)
|
| 270 |
-
return service.files().list(pageSize=15, fields="files(id, name, mimeType)").execute().get('files', [])
|
| 271 |
-
except Exception as e:
|
| 272 |
-
return []
|
| 273 |
|
| 274 |
def download_drive_file(self, file_id, mime_type, credentials_dict):
|
| 275 |
-
|
| 276 |
-
try:
|
| 277 |
-
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
|
| 278 |
-
creds = service_account.Credentials.from_service_account_info(credentials_dict, scopes=SCOPES)
|
| 279 |
-
service = build('drive', 'v3', credentials=creds)
|
| 280 |
-
if "spreadsheet" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='text/csv')
|
| 281 |
-
elif "document" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
|
| 282 |
-
elif "presentation" in mime_type: request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
|
| 283 |
-
else: request = service.files().get_media(fileId=file_id)
|
| 284 |
-
fh = io.BytesIO()
|
| 285 |
-
downloader = MediaIoBaseDownload(fh, request)
|
| 286 |
-
done = False
|
| 287 |
-
while done is False: status, done = downloader.next_chunk()
|
| 288 |
-
return fh.getvalue()
|
| 289 |
-
except: return b""
|
| 290 |
-
|
| 291 |
-
def get_s3_buckets(self, access_key, secret_key, region):
|
| 292 |
-
if not AWS_AVAILABLE: return []
|
| 293 |
-
try:
|
| 294 |
-
s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region)
|
| 295 |
-
response = s3.list_buckets()
|
| 296 |
-
return [b['Name'] for b in response.get('Buckets', [])]
|
| 297 |
-
except Exception as e:
|
| 298 |
-
print(f"S3 Error: {e}")
|
| 299 |
-
return []
|
| 300 |
-
|
| 301 |
-
def get_s3_files(self, access_key, secret_key, region, bucket_name):
|
| 302 |
-
if not AWS_AVAILABLE: return []
|
| 303 |
-
try:
|
| 304 |
-
s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region)
|
| 305 |
-
response = s3.list_objects_v2(Bucket=bucket_name)
|
| 306 |
-
return [obj['Key'] for obj in response.get('Contents', [])]
|
| 307 |
-
except Exception as e:
|
| 308 |
-
return []
|
| 309 |
-
|
| 310 |
-
def download_s3_file(self, access_key, secret_key, region, bucket_name, file_key):
|
| 311 |
-
if not AWS_AVAILABLE: return b""
|
| 312 |
-
try:
|
| 313 |
-
s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region)
|
| 314 |
-
obj = s3.get_object(Bucket=bucket_name, Key=file_key)
|
| 315 |
-
return obj['Body'].read()
|
| 316 |
-
except Exception as e:
|
| 317 |
-
return b""
|
| 318 |
-
|
| 319 |
-
def get_azure_containers(self, conn_str):
|
| 320 |
-
if not AZURE_AVAILABLE: return []
|
| 321 |
-
try:
|
| 322 |
-
blob_service_client = BlobServiceClient.from_connection_string(conn_str)
|
| 323 |
-
containers = blob_service_client.list_containers()
|
| 324 |
-
return [c['name'] for c in containers]
|
| 325 |
-
except Exception as e:
|
| 326 |
-
print(f"Azure Error: {e}")
|
| 327 |
-
return []
|
| 328 |
-
|
| 329 |
-
def get_azure_blobs(self, conn_str, container_name):
|
| 330 |
-
if not AZURE_AVAILABLE: return []
|
| 331 |
-
try:
|
| 332 |
-
blob_service_client = BlobServiceClient.from_connection_string(conn_str)
|
| 333 |
-
container_client = blob_service_client.get_container_client(container_name)
|
| 334 |
-
blobs = container_client.list_blobs()
|
| 335 |
-
return [b['name'] for b in blobs]
|
| 336 |
-
except Exception as e:
|
| 337 |
-
return []
|
| 338 |
-
|
| 339 |
-
def download_azure_blob(self, conn_str, container_name, blob_name):
|
| 340 |
-
if not AZURE_AVAILABLE: return b""
|
| 341 |
-
try:
|
| 342 |
-
blob_service_client = BlobServiceClient.from_connection_string(conn_str)
|
| 343 |
-
blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
|
| 344 |
-
return blob_client.download_blob().readall()
|
| 345 |
-
except Exception as e:
|
| 346 |
-
return b""
|
| 347 |
-
|
| 348 |
-
# --- GCP BUCKET CONNECTORS (NEW) ---
|
| 349 |
-
def get_gcs_buckets(self, credentials_dict):
|
| 350 |
-
"""Lists all GCS buckets for the given service account credentials."""
|
| 351 |
-
if not GCS_AVAILABLE: return []
|
| 352 |
-
try:
|
| 353 |
-
# Create credentials object
|
| 354 |
-
credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict)
|
| 355 |
-
# Create storage client
|
| 356 |
-
storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))
|
| 357 |
-
|
| 358 |
-
buckets = storage_client.list_buckets()
|
| 359 |
-
return [bucket.name for bucket in buckets]
|
| 360 |
-
except Exception as e:
|
| 361 |
-
print(f"GCP Bucket Error: {e}")
|
| 362 |
-
return []
|
| 363 |
-
|
| 364 |
-
def get_gcs_files(self, credentials_dict, bucket_name):
|
| 365 |
-
"""Lists files (blobs) in a specific GCS bucket."""
|
| 366 |
-
if not GCS_AVAILABLE: return []
|
| 367 |
-
try:
|
| 368 |
-
credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict)
|
| 369 |
-
storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))
|
| 370 |
-
|
| 371 |
-
blobs = storage_client.list_blobs(bucket_name)
|
| 372 |
-
return [blob.name for blob in blobs]
|
| 373 |
-
except Exception as e:
|
| 374 |
-
print(f"GCP List Error: {e}")
|
| 375 |
-
return []
|
| 376 |
-
|
| 377 |
-
def download_gcs_file(self, credentials_dict, bucket_name, blob_name):
|
| 378 |
-
"""Downloads a blob from GCS to memory."""
|
| 379 |
-
if not GCS_AVAILABLE: return b""
|
| 380 |
-
try:
|
| 381 |
-
credentials = gcp_service_account.Credentials.from_service_account_info(credentials_dict)
|
| 382 |
-
storage_client = storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))
|
| 383 |
-
|
| 384 |
-
bucket = storage_client.bucket(bucket_name)
|
| 385 |
-
blob = bucket.blob(blob_name)
|
| 386 |
-
return blob.download_as_bytes()
|
| 387 |
-
except Exception as e:
|
| 388 |
-
print(f"GCP Download Error: {e}")
|
| 389 |
-
return b""
|
| 390 |
-
|
| 391 |
-
# --- FILE READERS ---
|
| 392 |
-
def get_json_data(self, file_obj) -> pd.DataFrame:
|
| 393 |
-
data = json.load(file_obj)
|
| 394 |
-
flat = []
|
| 395 |
-
def recursive(d, path):
|
| 396 |
-
if isinstance(d, dict):
|
| 397 |
-
for k, v in d.items(): recursive(v, f"{path}.{k}" if path else k)
|
| 398 |
-
elif isinstance(d, list):
|
| 399 |
-
for i, v in enumerate(d): recursive(v, f"{path}[{i}]")
|
| 400 |
-
else: flat.append({"Path": path, "Value": str(d)})
|
| 401 |
-
recursive(data, "")
|
| 402 |
-
return pd.DataFrame(flat)
|
| 403 |
|
| 404 |
-
def
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
return pd.read_parquet(io.BytesIO(file_bytes))
|
| 408 |
-
except: return pd.DataFrame()
|
| 409 |
-
|
| 410 |
-
def get_pdf_total_pages(self, file_bytes) -> int:
|
| 411 |
-
try:
|
| 412 |
-
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 413 |
-
return len(doc)
|
| 414 |
-
except: return 0
|
| 415 |
|
| 416 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
try:
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
import json
|
| 3 |
import pandas as pd
|
| 4 |
import fitz # PyMuPDF
|
| 5 |
import nltk
|
| 6 |
import io
|
| 7 |
+
import os
|
| 8 |
+
import pickle
|
| 9 |
+
import base64
|
| 10 |
from typing import Dict, List, Any
|
| 11 |
from sqlalchemy import create_engine
|
| 12 |
from urllib.parse import quote_plus
|
| 13 |
+
from bs4 import BeautifulSoup
|
| 14 |
+
|
| 15 |
+
# --- IMPORT CLASSIFIERS ---
|
| 16 |
+
from classifier_manager.spacy_model import PiiSpacyAnalyzer
|
| 17 |
+
from classifier_manager.presidio_model import PiiPresidioAnalyzer
|
| 18 |
+
from classifier_manager.gliner_model import PiiGlinerAnalyzer
|
| 19 |
+
from classifier_manager.inspector import ModelInspector
|
| 20 |
+
|
| 21 |
+
# --- IMPORT FILE HANDLERS ---
|
| 22 |
+
from file_handlers.ocr_engine import OcrEngine
|
| 23 |
+
from file_handlers.avro_handler import AvroHandler
|
| 24 |
+
from file_handlers.parquet_handler import ParquetHandler
|
| 25 |
+
from file_handlers.json_handler import JsonHandler
|
| 26 |
+
from file_handlers.pdf_handler import PdfHandler
|
| 27 |
+
|
| 28 |
+
# --- IMPORT CONNECTORS ---
|
| 29 |
+
from connectors.postgres_handler import PostgresHandler
|
| 30 |
+
from connectors.mysql_handler import MysqlHandler
|
| 31 |
+
from connectors.gmail_handler import GmailHandler
|
| 32 |
+
from connectors.drive_handler import DriveHandler
|
| 33 |
+
from connectors.aws_s3_handler import S3Handler
|
| 34 |
+
from connectors.azure_handler import AzureBlobHandler
|
| 35 |
+
from connectors.gcp_storage_handler import GcpStorageHandler
|
| 36 |
+
from connectors.slack_handler import SlackHandler # <--- NEW
|
| 37 |
+
from connectors.confluence_handler import ConfluenceHandler # <--- NEW
|
| 38 |
|
| 39 |
# --- DEPENDENCY CHECKS ---
|
| 40 |
try:
|
| 41 |
from googleapiclient.discovery import build
|
|
|
|
|
|
|
| 42 |
GOOGLE_AVAILABLE = True
|
| 43 |
except ImportError:
|
| 44 |
GOOGLE_AVAILABLE = False
|
| 45 |
+
print("Google Libraries not installed.")
|
|
|
|
| 46 |
try:
|
| 47 |
import pymongo
|
| 48 |
MONGO_AVAILABLE = True
|
| 49 |
+
except: MONGO_AVAILABLE = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
try:
|
| 51 |
import boto3
|
| 52 |
AWS_AVAILABLE = True
|
| 53 |
+
except: AWS_AVAILABLE = False
|
|
|
|
|
|
|
|
|
|
| 54 |
try:
|
| 55 |
from azure.storage.blob import BlobServiceClient
|
| 56 |
AZURE_AVAILABLE = True
|
| 57 |
+
except: AZURE_AVAILABLE = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
try:
|
| 59 |
from google.cloud import storage
|
|
|
|
|
|
|
| 60 |
GCS_AVAILABLE = True
|
| 61 |
+
except: GCS_AVAILABLE = False
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
# NLTK Setup
|
| 64 |
try:
|
| 65 |
nltk.data.find('tokenizers/punkt')
|
| 66 |
except LookupError:
|
|
|
|
| 73 |
class RegexClassifier:
|
| 74 |
def __init__(self):
|
| 75 |
self.colors = {
|
| 76 |
+
"EMAIL": "#8ef", "FIRST_NAME": "#af9", "LAST_NAME": "#af9",
|
| 77 |
+
"PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea",
|
| 78 |
+
"LOCATION": "#dcf", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"
|
|
|
|
| 79 |
}
|
| 80 |
|
| 81 |
self.patterns: Dict[str, str] = {
|
|
|
|
| 87 |
"PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b",
|
| 88 |
}
|
| 89 |
|
| 90 |
+
# 1. Classifiers
|
| 91 |
self.spacy_analyzer = PiiSpacyAnalyzer()
|
| 92 |
self.presidio_analyzer = PiiPresidioAnalyzer()
|
| 93 |
+
self.gliner_analyzer = PiiGlinerAnalyzer()
|
| 94 |
self.inspector = ModelInspector()
|
| 95 |
+
|
| 96 |
+
# 2. File Handlers
|
| 97 |
+
self.ocr_engine = OcrEngine()
|
| 98 |
+
self.avro_handler = AvroHandler()
|
| 99 |
+
self.parquet_handler = ParquetHandler()
|
| 100 |
+
self.json_handler = JsonHandler()
|
| 101 |
+
self.pdf_handler = PdfHandler(self.ocr_engine)
|
| 102 |
+
|
| 103 |
+
# 3. Connectors
|
| 104 |
+
self.pg_handler = PostgresHandler()
|
| 105 |
+
self.mysql_handler = MysqlHandler()
|
| 106 |
+
self.gmail_handler = GmailHandler()
|
| 107 |
+
self.drive_handler = DriveHandler()
|
| 108 |
+
self.s3_handler = S3Handler()
|
| 109 |
+
self.azure_handler = AzureBlobHandler()
|
| 110 |
+
self.gcp_handler = GcpStorageHandler()
|
| 111 |
+
self.slack_handler = SlackHandler() # <--- Init
|
| 112 |
+
self.confluence_handler = ConfluenceHandler() # <--- Init
|
| 113 |
|
| 114 |
def list_patterns(self): return self.patterns
|
| 115 |
def add_pattern(self, n, r): self.patterns[n.upper()] = r
|
| 116 |
def remove_pattern(self, n): self.patterns.pop(n.upper(), None)
|
| 117 |
|
| 118 |
+
# --- CORE ANALYSIS ---
|
| 119 |
def scan_with_regex(self, text: str) -> List[dict]:
|
| 120 |
matches = []
|
| 121 |
for label, regex in self.patterns.items():
|
| 122 |
+
for m in re.finditer(regex, text):
|
| 123 |
+
matches.append({"label": label, "text": m.group(), "start": m.start(), "end": m.end(), "source": "Regex"})
|
| 124 |
return matches
|
| 125 |
|
| 126 |
def scan_with_nltk(self, text: str) -> List[dict]:
|
| 127 |
detections = []
|
| 128 |
try:
|
| 129 |
+
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
|
|
|
|
|
|
|
|
|
|
| 130 |
if hasattr(chunk, 'label') and chunk.label() in ['PERSON', 'GPE']:
|
| 131 |
val = " ".join(c[0] for c in chunk)
|
| 132 |
+
start = text.find(val)
|
| 133 |
+
if start != -1:
|
| 134 |
+
detections.append({
|
| 135 |
+
"label": "LOCATION" if chunk.label() == 'GPE' else "FIRST_NAME",
|
| 136 |
+
"text": val, "start": start, "end": start+len(val), "source": "NLTK"
|
| 137 |
+
})
|
| 138 |
except: pass
|
| 139 |
return detections
|
| 140 |
|
| 141 |
def analyze_text_hybrid(self, text: str) -> List[dict]:
|
| 142 |
+
if not text: return []
|
| 143 |
all_matches = []
|
| 144 |
all_matches.extend(self.scan_with_regex(text))
|
| 145 |
all_matches.extend(self.scan_with_nltk(text))
|
| 146 |
all_matches.extend(self.spacy_analyzer.scan(text))
|
| 147 |
all_matches.extend(self.presidio_analyzer.scan(text))
|
| 148 |
+
all_matches.extend(self.gliner_analyzer.scan(text))
|
| 149 |
|
| 150 |
all_matches.sort(key=lambda x: x['start'])
|
| 151 |
+
unique = []
|
|
|
|
| 152 |
if not all_matches: return []
|
| 153 |
curr = all_matches[0]
|
| 154 |
+
for next_m in all_matches[1:]:
|
| 155 |
+
if next_m['start'] < curr['end']:
|
| 156 |
+
if len(next_m['text']) > len(curr['text']):
|
| 157 |
+
curr = next_m
|
| 158 |
else:
|
| 159 |
+
unique.append(curr)
|
| 160 |
+
curr = next_m
|
| 161 |
+
unique.append(curr)
|
| 162 |
+
return unique
|
| 163 |
+
|
| 164 |
+
def run_full_inspection(self, text: str):
|
| 165 |
+
return self.inspector.compare_models(
|
| 166 |
+
self.scan_with_regex(text),
|
| 167 |
+
self.scan_with_nltk(text),
|
| 168 |
+
self.spacy_analyzer.scan(text),
|
| 169 |
+
self.presidio_analyzer.scan(text),
|
| 170 |
+
self.gliner_analyzer.scan(text)
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
    # --- WRAPPERS FOR UI ---
    # Thin delegations so the UI layer only ever talks to this class.

    def get_json_data(self, file_obj) -> pd.DataFrame:
        """Parse an uploaded JSON file object into a DataFrame via JsonHandler."""
        return self.json_handler.read_file(file_obj)

    def get_pdf_page_text(self, file_bytes, page_num):
        """Extract the text of one PDF page (PdfHandler was constructed with
        the OCR engine, so it may OCR scanned pages — confirm in PdfHandler)."""
        return self.pdf_handler.get_page_text(file_bytes, page_num)

    def get_pdf_total_pages(self, file_bytes) -> int:
        """Return the number of pages in a PDF given its raw bytes."""
        return self.pdf_handler.get_total_pages(file_bytes)

    def get_labeled_pdf_image(self, file_bytes, page_num):
        """Render one PDF page as an image with detected PII highlighted
        using the color map in self.colors."""
        text = self.get_pdf_page_text(file_bytes, page_num)
        matches = self.analyze_text_hybrid(text)
        return self.pdf_handler.render_labeled_image(file_bytes, page_num, matches, self.colors)

    def get_avro_data(self, file_bytes) -> pd.DataFrame:
        """Decode Avro bytes into a DataFrame via AvroHandler."""
        return self.avro_handler.convert_to_dataframe(file_bytes)

    def get_parquet_data(self, file_bytes) -> pd.DataFrame:
        """Decode Parquet bytes into a DataFrame via ParquetHandler."""
        return self.parquet_handler.convert_to_dataframe(file_bytes)

    def get_ocr_text_from_image(self, file_bytes) -> str:
        """OCR an image and return the recognized text."""
        return self.ocr_engine.extract_text(file_bytes)
|
| 196 |
+
|
| 197 |
+
def get_pii_counts_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 198 |
+
text = " ".join(df.astype(str).values.flatten())
|
| 199 |
+
matches = self.analyze_text_hybrid(str(text))
|
| 200 |
+
if not matches: return pd.DataFrame(columns=["PII Type", "Count"])
|
| 201 |
+
counts = {}
|
| 202 |
+
for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1
|
| 203 |
+
return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"])
|
| 204 |
+
|
| 205 |
def get_pii_counts(self, text: str) -> pd.DataFrame:
|
| 206 |
matches = self.analyze_text_hybrid(str(text))
|
| 207 |
if not matches: return pd.DataFrame(columns=["PII Type", "Count"])
|
|
|
|
| 209 |
for m in matches: counts[m['label']] = counts.get(m['label'], 0) + 1
|
| 210 |
return pd.DataFrame(list(counts.items()), columns=["PII Type", "Count"])
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
def mask_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 213 |
+
def mask_text(text):
|
| 214 |
+
text = str(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
matches = self.analyze_text_hybrid(text)
|
| 216 |
+
matches.sort(key=lambda x: x['start'], reverse=True)
|
| 217 |
for m in matches:
|
| 218 |
+
if "***" not in text[m['start']:m['end']]:
|
| 219 |
+
text = text[:m['start']] + "******" + text[m['end']:]
|
| 220 |
+
return text
|
| 221 |
+
return df.map(lambda x: mask_text(x) if isinstance(x, (str, int, float)) else x)
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
    def scan_dataframe_with_html(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return *df* with each string cell rewritten as HTML, wrapping every
        detected PII span in a colored <span> (colors from self.colors)."""
        def highlight(text):
            text = str(text)
            matches = self.analyze_text_hybrid(text)
            # Insert right-to-left so earlier offsets stay valid after splicing.
            matches.sort(key=lambda x: x['start'], reverse=True)
            for m in matches:
                # Skip spans that already received markup (overlapping hits).
                if "<span" in text[m['start']:m['end']]: continue
                color = self.colors.get(m['label'], self.colors["DEFAULT"])
                # NOTE(review): cell text and m["text"] are embedded without HTML
                # escaping; if cells can contain markup this is an injection
                # risk — confirm how the UI renders this output.
                replacement = f'<span style="background:{color}; padding:2px; border-radius:4px;">{m["text"]}</span>'
                text = text[:m['start']] + replacement + text[m['end']:]
            return text
        return df.map(lambda x: highlight(x) if isinstance(x, str) else x)
|
| 235 |
+
|
| 236 |
+
def get_data_schema(self, df):
|
| 237 |
+
return pd.DataFrame({"Column": df.columns, "Type": df.dtypes.astype(str)})
|
| 238 |
+
|
| 239 |
+
    # --- CONNECTOR WRAPPERS ---

    def get_postgres_data(self, host, port, db, user, pw, table):
        """Fetch rows from a PostgreSQL table via PostgresHandler."""
        return self.pg_handler.fetch_data(host, port, db, user, pw, table)

    def get_mysql_data(self, host, port, db, user, pw, table):
        """Fetch rows from a MySQL table via MysqlHandler."""
        return self.mysql_handler.fetch_data(host, port, db, user, pw, table)

    def get_gmail_data(self, credentials_file, num_emails=10) -> pd.DataFrame:
        """Fetch the most recent emails via GmailHandler (the OAuth flow and
        token caching happen inside the handler)."""
        return self.gmail_handler.fetch_emails(credentials_file, num_emails)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
    def get_google_drive_files(self, credentials_dict):
        """List Drive files visible to the service account (via DriveHandler)."""
        return self.drive_handler.list_files(credentials_dict)

    def download_drive_file(self, file_id, mime_type, credentials_dict):
        """Download a Drive file's bytes; native Google formats are exported
        (Sheets -> CSV, Docs/Slides -> PDF) by DriveHandler."""
        return self.drive_handler.download_file(file_id, mime_type, credentials_dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
+
    # AWS S3 wrappers. Short positional params kept for existing UI call
    # sites: a=access key, s=secret key, r=region, b=bucket, k=object key.
    def get_s3_buckets(self, a, s, r): return self.s3_handler.get_buckets(a, s, r)
    def get_s3_files(self, a, s, r, b): return self.s3_handler.get_files(a, s, r, b)
    def download_s3_file(self, a, s, r, b, k): return self.s3_handler.download_file(a, s, r, b, k)

    # Azure Blob Storage wrappers: c=connection string, n=container, b=blob name.
    def get_azure_containers(self, c): return self.azure_handler.get_containers(c)
    def get_azure_blobs(self, c, n): return self.azure_handler.get_blobs(c, n)
    def download_azure_blob(self, c, n, b): return self.azure_handler.download_blob(c, n, b)

    # Google Cloud Storage wrappers: c=service-account info dict, b=bucket, n=blob name.
    def get_gcs_buckets(self, c): return self.gcp_handler.get_buckets(c)
    def get_gcs_files(self, c, b): return self.gcp_handler.get_files(c, b)
    def download_gcs_file(self, c, b, n): return self.gcp_handler.download_file(c, b, n)

    # --- NEW WRAPPERS FOR SLACK & CONFLUENCE ---
    def get_slack_messages(self, token, channel_id):
        """Fetch messages from one Slack channel via SlackHandler."""
        return self.slack_handler.fetch_messages(token, channel_id)

    def get_confluence_page(self, url, username, token, page_id):
        """Fetch a Confluence page's cleaned text via ConfluenceHandler."""
        return self.confluence_handler.fetch_page_content(url, username, token, page_id)
|
| 273 |
+
|
| 274 |
+
# --- MONGO (Still here) ---
|
| 275 |
+
def get_mongodb_data(self, host, port, db, user, pw, collection):
|
| 276 |
+
if not MONGO_AVAILABLE: return pd.DataFrame()
|
| 277 |
try:
|
| 278 |
+
if user and pw: uri = f"mongodb://{quote_plus(user)}:{quote_plus(pw)}@{host}:{port}/"
|
| 279 |
+
else: uri = f"mongodb://{host}:{port}/"
|
| 280 |
+
client = pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000)
|
| 281 |
+
cursor = client[db][collection].find().limit(100)
|
| 282 |
+
data = list(cursor)
|
| 283 |
+
if not data: return pd.DataFrame()
|
| 284 |
+
for d in data: d['_id'] = str(d.get('_id', ''))
|
| 285 |
+
return pd.json_normalize(data)
|
| 286 |
+
except: return pd.DataFrame()
|
new_spacy β classifier_manager/__init__.py
RENAMED
|
File without changes
|
classifier_manager/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (206 Bytes). View file
|
|
|
classifier_manager/__pycache__/gliner_model.cpython-313.pyc
ADDED
|
Binary file (2.9 kB). View file
|
|
|
classifier_manager/__pycache__/inspector.cpython-313.pyc
ADDED
|
Binary file (3.8 kB). View file
|
|
|
classifier_manager/__pycache__/presidio_model.cpython-313.pyc
ADDED
|
Binary file (2.97 kB). View file
|
|
|
classifier_manager/__pycache__/regex_scanner.cpython-313.pyc
ADDED
|
Binary file (2.51 kB). View file
|
|
|
classifier_manager/__pycache__/spacy_model.cpython-313.pyc
ADDED
|
Binary file (2.85 kB). View file
|
|
|
classifier_manager/gliner_model.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from gliner import GLiNER
|
| 2 |
+
|
| 3 |
+
class PiiGlinerAnalyzer:
    """PII scanner backed by GLiNER zero-shot named-entity recognition.

    Loads a small BERT-based GLiNER model and prompts it with
    natural-language labels; detections are normalized to the app's
    standard dict shape (label/text/start/end/score/source).
    """

    # Natural-language prompt labels GLiNER is asked to find.
    DEFAULT_LABELS = [
        "person",
        "email",
        "phone number",
        "credit card",
        "social security number",
        "organization",
        "location",
        "date",
        "ip address",
        "passport number",
        "driver license",
    ]

    # Map GLiNER's lowercase labels to the app's uppercase keys.
    # Hoisted to a class constant: previously rebuilt on every scan() call.
    LABEL_MAP = {
        "person": "FIRST_NAME",
        "phone number": "PHONE",
        "social security number": "SSN",
        "organization": "ORG",
        "location": "LOCATION",
        "ip address": "IP_ADDRESS",
        "credit card": "CREDIT_CARD",
        "email": "EMAIL",
        "date": "DATE_TIME",
        "passport number": "PASSPORT",
        "driver license": "DRIVER_LICENSE",
    }

    def __init__(self, model_name="urchade/gliner_small-v2.1"):
        """Initialize the GLiNER model.

        Uses a small, efficient BERT-based model by default; the model is
        downloaded to the local cache on the first run. On failure the
        analyzer stays usable but scan() always returns [].
        """
        self.model = None
        self.available = False
        # Instance copy kept so callers can customize labels per-instance.
        self.labels = list(self.DEFAULT_LABELS)
        try:
            print(f"⏳ Loading GLiNER model: {model_name}...")
            self.model = GLiNER.from_pretrained(model_name)
            self.available = True
            print("✅ GLiNER model loaded successfully.")
        except Exception as e:
            print(f"❌ Error loading GLiNER: {e}")

    def scan(self, text: str, threshold: float = 0.5) -> list:
        """Scan *text* with GLiNER and normalize the output.

        *threshold* generalizes the previously hard-coded 0.5 confidence
        cut-off; the default preserves the old behavior (0.5 is a good
        balance for the small model).
        Returns [] when the model is unavailable or the text is blank.
        """
        if not self.available or not text or not text.strip():
            return []
        try:
            entities = self.model.predict_entities(text, self.labels, threshold=threshold)
            detections = []
            for ent in entities:
                detections.append({
                    # Unknown labels fall back to UPPER_SNAKE_CASE of the prompt.
                    "label": self.LABEL_MAP.get(ent["label"], ent["label"].upper().replace(" ", "_")),
                    "text": ent["text"],
                    "start": ent["start"],
                    "end": ent["end"],
                    "score": ent["score"],
                    "source": "GLiNER",  # helpful metadata for the inspector
                })
            return detections
        except Exception as e:
            print(f"⚠️ GLiNER Scan Error: {e}")
            return []
|
inspector.py β classifier_manager/inspector.py
RENAMED
|
@@ -12,16 +12,17 @@ class ModelInspector:
|
|
| 12 |
"end": match["end"]
|
| 13 |
}
|
| 14 |
|
| 15 |
-
def compare_models(self, regex_matches, nltk_matches, spacy_matches, presidio_matches):
|
| 16 |
"""
|
| 17 |
-
Compares
|
|
|
|
| 18 |
"""
|
| 19 |
all_detections = {}
|
| 20 |
|
| 21 |
def add_to_master(matches, model_name):
|
| 22 |
found_set = set()
|
| 23 |
for m in matches:
|
| 24 |
-
# Use tuple key for uniqueness
|
| 25 |
key = (m['start'], m['end'], m['text'])
|
| 26 |
if key not in all_detections:
|
| 27 |
all_detections[key] = {'text': m['text'], 'label': m['label']}
|
|
@@ -32,19 +33,26 @@ class ModelInspector:
|
|
| 32 |
regex_set = add_to_master(regex_matches, "Regex")
|
| 33 |
nltk_set = add_to_master(nltk_matches, "NLTK")
|
| 34 |
spacy_set = add_to_master(spacy_matches, "SpaCy")
|
| 35 |
-
presidio_set = add_to_master(presidio_matches, "Presidio")
|
|
|
|
| 36 |
|
| 37 |
-
# 2. Calculate "Missed" Data
|
| 38 |
total_unique_pii = set(all_detections.keys())
|
| 39 |
|
| 40 |
regex_missed = total_unique_pii - regex_set
|
| 41 |
nltk_missed = total_unique_pii - nltk_set
|
| 42 |
spacy_missed = total_unique_pii - spacy_set
|
| 43 |
-
presidio_missed = total_unique_pii - presidio_set
|
|
|
|
| 44 |
|
| 45 |
def fmt(item_set):
|
| 46 |
items = [all_detections[k]['text'] for k in item_set]
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
total_count = len(total_unique_pii) if len(total_unique_pii) > 0 else 1
|
| 50 |
|
|
@@ -76,7 +84,15 @@ class ModelInspector:
|
|
| 76 |
"Missed PII": fmt(presidio_missed),
|
| 77 |
"Accuracy": len(presidio_set) / total_count,
|
| 78 |
"Count": len(presidio_set)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
}
|
| 80 |
]
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
| 12 |
"end": match["end"]
|
| 13 |
}
|
| 14 |
|
| 15 |
+
def compare_models(self, regex_matches, nltk_matches, spacy_matches, presidio_matches, gliner_matches):
|
| 16 |
"""
|
| 17 |
+
Compares 5 lists of matches to find Unique vs Missed PII.
|
| 18 |
+
Added GLiNER to the comparison logic.
|
| 19 |
"""
|
| 20 |
all_detections = {}
|
| 21 |
|
| 22 |
def add_to_master(matches, model_name):
|
| 23 |
found_set = set()
|
| 24 |
for m in matches:
|
| 25 |
+
# Use tuple key for uniqueness: (start, end, text)
|
| 26 |
key = (m['start'], m['end'], m['text'])
|
| 27 |
if key not in all_detections:
|
| 28 |
all_detections[key] = {'text': m['text'], 'label': m['label']}
|
|
|
|
| 33 |
regex_set = add_to_master(regex_matches, "Regex")
|
| 34 |
nltk_set = add_to_master(nltk_matches, "NLTK")
|
| 35 |
spacy_set = add_to_master(spacy_matches, "SpaCy")
|
| 36 |
+
presidio_set = add_to_master(presidio_matches, "Presidio")
|
| 37 |
+
gliner_set = add_to_master(gliner_matches, "GLiNER") # <--- Added GLiNER
|
| 38 |
|
| 39 |
+
# 2. Calculate "Missed" Data (Union of all models)
|
| 40 |
total_unique_pii = set(all_detections.keys())
|
| 41 |
|
| 42 |
regex_missed = total_unique_pii - regex_set
|
| 43 |
nltk_missed = total_unique_pii - nltk_set
|
| 44 |
spacy_missed = total_unique_pii - spacy_set
|
| 45 |
+
presidio_missed = total_unique_pii - presidio_set
|
| 46 |
+
gliner_missed = total_unique_pii - gliner_set # <--- Added GLiNER
|
| 47 |
|
| 48 |
def fmt(item_set):
|
| 49 |
items = [all_detections[k]['text'] for k in item_set]
|
| 50 |
+
# Limiting to first 5 items to prevent UI clutter if list is huge
|
| 51 |
+
display_items = items[:5]
|
| 52 |
+
res = ", ".join(display_items)
|
| 53 |
+
if len(items) > 5:
|
| 54 |
+
res += f", (+{len(items)-5} more)"
|
| 55 |
+
return res if res else "None"
|
| 56 |
|
| 57 |
total_count = len(total_unique_pii) if len(total_unique_pii) > 0 else 1
|
| 58 |
|
|
|
|
| 84 |
"Missed PII": fmt(presidio_missed),
|
| 85 |
"Accuracy": len(presidio_set) / total_count,
|
| 86 |
"Count": len(presidio_set)
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"Model": "π¦
GLiNER",
|
| 90 |
+
"Detected PII": fmt(gliner_set),
|
| 91 |
+
"Missed PII": fmt(gliner_missed),
|
| 92 |
+
"Accuracy": len(gliner_set) / total_count,
|
| 93 |
+
"Count": len(gliner_set)
|
| 94 |
}
|
| 95 |
]
|
| 96 |
|
| 97 |
+
# Return sorted by Accuracy descending so best model is on top
|
| 98 |
+
return pd.DataFrame(stats).sort_values(by="Accuracy", ascending=False)
|
presidio_model.py β classifier_manager/presidio_model.py
RENAMED
|
File without changes
|
classifier_manager/regex_scanner.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
class RegexScanner:
    """Regex-based PII scanner with a user-extensible pattern set."""

    def __init__(self):
        # Highlight colors keyed by PII label; DEFAULT is the fallback.
        self.colors = {
            "EMAIL": "#8ef", "FIRST_NAME": "#af9", "LAST_NAME": "#af9",
            "PHONE": "#faa", "SSN": "#fca", "CREDIT_CARD": "#fea",
            "LOCATION": "#dcf", "ORG": "#ffecb3", "DEFAULT": "#e0e0e0"
        }

        # Built-in detection patterns; labels are upper-case by convention.
        self.patterns: Dict[str, str] = {
            "EMAIL": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            "PHONE": r"\b(?:\+?1[-. ]?)?\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})\b",
            "SSN": r"\b\d{3}-\d{2}-\d{4}\b",
            "CREDIT_CARD": r"\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b",
            "AADHAAR_IND": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b",
            "PAN_IND": r"\b[A-Z]{5}\d{4}[A-Z]{1}\b",
        }

    def add_pattern(self, name, regex):
        """Register (or replace) a pattern under an upper-cased label."""
        self.patterns[name.upper()] = regex

    def remove_pattern(self, name):
        """Remove a pattern by label; unknown labels are ignored."""
        self.patterns.pop(name.upper(), None)

    def scan(self, text: str) -> List[dict]:
        """Run every registered pattern over *text*.

        Returns detection dicts (label/text/start/end/source); invalid
        user-defined patterns are skipped rather than raising.
        """
        hits: List[dict] = []
        for label, pattern in self.patterns.items():
            try:
                found = re.finditer(pattern, text)
            except re.error:
                continue  # invalid user-defined regex
            hits.extend(
                {
                    "label": label,
                    "text": match.group(),
                    "start": match.start(),
                    "end": match.end(),
                    "source": "Regex",
                }
                for match in found
            )
        return hits
|
Spacy_model.py β classifier_manager/spacy_model.py
RENAMED
|
File without changes
|
connectors/__init__.py
ADDED
|
File without changes
|
connectors/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (198 Bytes). View file
|
|
|
connectors/__pycache__/aws_s3_handler.cpython-313.pyc
ADDED
|
Binary file (2.55 kB). View file
|
|
|
connectors/__pycache__/azure_handler.cpython-313.pyc
ADDED
|
Binary file (2.55 kB). View file
|
|
|
connectors/__pycache__/confluence_handler.cpython-313.pyc
ADDED
|
Binary file (2.02 kB). View file
|
|
|
connectors/__pycache__/drive_handler.cpython-313.pyc
ADDED
|
Binary file (3.18 kB). View file
|
|
|
connectors/__pycache__/gcp_storage_handler.cpython-313.pyc
ADDED
|
Binary file (3.11 kB). View file
|
|
|
connectors/__pycache__/gmail_handler.cpython-313.pyc
ADDED
|
Binary file (5.2 kB). View file
|
|
|
connectors/__pycache__/mongo_handler.cpython-313.pyc
ADDED
|
Binary file (2.32 kB). View file
|
|
|
connectors/__pycache__/mysql_handler.cpython-313.pyc
ADDED
|
Binary file (1.55 kB). View file
|
|
|
connectors/__pycache__/postgres_handler.cpython-313.pyc
ADDED
|
Binary file (1.58 kB). View file
|
|
|
connectors/__pycache__/slack_handler.cpython-313.pyc
ADDED
|
Binary file (2.6 kB). View file
|
|
|
connectors/aws_s3_handler.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import boto3
|
| 2 |
+
import io
|
| 3 |
+
|
| 4 |
+
class S3Handler:
    """Thin wrapper around the boto3 S3 client for bucket/object browsing."""

    def __init__(self):
        print("✅ AWS S3 Handler loaded.")

    def _client(self, access_key, secret_key, region):
        """Build a boto3 S3 client from explicit credentials.

        Extracted helper: the same construction was duplicated in all
        three public methods.
        """
        return boto3.client(
            's3',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
        )

    def get_buckets(self, access_key, secret_key, region):
        """Return every bucket name, or [] on any error."""
        try:
            response = self._client(access_key, secret_key, region).list_buckets()
            return [b['Name'] for b in response.get('Buckets', [])]
        except Exception as e:
            print(f"❌ S3 Error: {e}")
            return []

    def get_files(self, access_key, secret_key, region, bucket_name):
        """Return object keys in *bucket_name*, or [] on any error.

        NOTE: list_objects_v2 returns at most 1000 keys per call; larger
        buckets would need pagination.
        """
        try:
            response = self._client(access_key, secret_key, region).list_objects_v2(Bucket=bucket_name)
            return [obj['Key'] for obj in response.get('Contents', [])]
        except Exception as e:
            # Fix: this error was silently swallowed, unlike the sibling methods.
            print(f"❌ S3 List Error: {e}")
            return []

    def download_file(self, access_key, secret_key, region, bucket_name, file_key):
        """Download one object's bytes, or b"" on any error."""
        try:
            obj = self._client(access_key, secret_key, region).get_object(Bucket=bucket_name, Key=file_key)
            return obj['Body'].read()
        except Exception as e:
            print(f"❌ S3 Download Error: {e}")
            return b""
|
connectors/azure_handler.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from azure.storage.blob import BlobServiceClient
|
| 2 |
+
|
| 3 |
+
class AzureBlobHandler:
    """Container/blob browsing for Azure Blob Storage via a connection string."""

    def __init__(self):
        print("✅ Azure Blob Handler loaded.")

    def get_containers(self, conn_str):
        """Return container names for the account, or [] on any error."""
        try:
            blob_service_client = BlobServiceClient.from_connection_string(conn_str)
            containers = blob_service_client.list_containers()
            # NOTE(review): assumes the SDK's container items support
            # mapping-style access (c['name']) — confirm for the installed
            # azure-storage-blob version.
            return [c['name'] for c in containers]
        except Exception as e:
            print(f"❌ Azure Error: {e}")
            return []

    def get_blobs(self, conn_str, container_name):
        """Return blob names inside *container_name*, or [] on any error."""
        try:
            blob_service_client = BlobServiceClient.from_connection_string(conn_str)
            container_client = blob_service_client.get_container_client(container_name)
            blobs = container_client.list_blobs()
            return [b['name'] for b in blobs]
        except Exception as e:
            # Errors here are intentionally silent (no print, unlike siblings).
            return []

    def download_blob(self, conn_str, container_name, blob_name):
        """Download one blob's raw bytes, or b"" on any error."""
        try:
            blob_service_client = BlobServiceClient.from_connection_string(conn_str)
            blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
            return blob_client.download_blob().readall()
        except Exception as e:
            print(f"❌ Azure Download Error: {e}")
            return b""
|
connectors/confluence_handler.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from atlassian import Confluence
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
|
| 5 |
+
class ConfluenceHandler:
    """Fetches and cleans Confluence page content for PII scanning."""

    def __init__(self):
        print("✅ Confluence Handler loaded.")

    def fetch_page_content(self, url, username, api_token, page_id):
        """Fetch the body content of a specific Confluence page.

        Returns a single-row DataFrame (Source/Sender/Subject/Content) so
        the page can be scanned like other text sources, or an empty
        DataFrame on any error.
        """
        try:
            # Initialize the Confluence API client; cloud=True targets
            # Atlassian Cloud and the API token is passed as the password.
            confluence = Confluence(
                url=url,
                username=username,
                password=api_token,
                cloud=True
            )

            # Get page content, expanding body.storage to include the HTML body.
            page = confluence.get_page_by_id(page_id, expand='body.storage')
            title = page.get('title', 'Unknown Title')

            # Extract the stored HTML body (may be empty).
            raw_html = page.get('body', {}).get('storage', {}).get('value', '')

            # Strip HTML tags so downstream PII scanners see plain text.
            if raw_html:
                clean_text = BeautifulSoup(raw_html, "html.parser").get_text(separator=' ')
            else:
                clean_text = ""

            return pd.DataFrame([{
                "Source": "Confluence",
                "Sender": username,
                "Subject": title,
                "Content": clean_text
            }])

        except Exception as e:
            print(f"❌ Confluence Error: {e}")
            return pd.DataFrame()
|
connectors/drive_handler.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import json
|
| 3 |
+
from googleapiclient.discovery import build
|
| 4 |
+
from googleapiclient.http import MediaIoBaseDownload
|
| 5 |
+
from google.oauth2 import service_account
|
| 6 |
+
|
| 7 |
+
class DriveHandler:
    """Lists and downloads Google Drive files using a service account."""

    def __init__(self):
        print("✅ Google Drive Handler loaded.")

    def list_files(self, credentials_dict):
        """Return up to 15 files (id/name/mimeType dicts) visible to the
        service account, or [] on any error."""
        try:
            creds = service_account.Credentials.from_service_account_info(
                credentials_dict, scopes=['https://www.googleapis.com/auth/drive.readonly']
            )
            service = build('drive', 'v3', credentials=creds)
            results = service.files().list(
                pageSize=15, fields="files(id, name, mimeType)"
            ).execute()
            return results.get('files', [])
        except Exception as e:
            print(f"❌ Drive List Error: {e}")
            return []

    def download_file(self, file_id, mime_type, credentials_dict) -> bytes:
        """Download a Drive file's bytes, or b"" on any error.

        Native Google formats cannot be fetched raw, so they are exported:
        Sheets -> CSV, Docs/Slides -> PDF; everything else downloads directly.
        """
        try:
            creds = service_account.Credentials.from_service_account_info(
                credentials_dict, scopes=['https://www.googleapis.com/auth/drive.readonly']
            )
            service = build('drive', 'v3', credentials=creds)

            # Export Google Docs formats to standard formats.
            if "spreadsheet" in mime_type:
                request = service.files().export_media(fileId=file_id, mimeType='text/csv')
            elif "document" in mime_type:
                request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            elif "presentation" in mime_type:
                request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            else:
                # Download binary files directly.
                request = service.files().get_media(fileId=file_id)

            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            # Drive downloads arrive in chunks; loop until the transfer completes.
            while done is False:
                status, done = downloader.next_chunk()

            return fh.getvalue()
        except Exception as e:
            print(f"❌ Drive Download Error: {e}")
            return b""
|
connectors/gcp_storage_handler.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from google.cloud import storage
|
| 2 |
+
from google.oauth2 import service_account
|
| 3 |
+
|
| 4 |
+
class GcpStorageHandler:
    """Bucket/blob browsing on Google Cloud Storage using service-account info."""

    def __init__(self):
        print("✅ GCP Storage Handler loaded.")

    def _client(self, credentials_dict):
        """Build a storage client from a service-account info dict.

        Extracted helper: the same two-line construction was duplicated
        in all three public methods.
        """
        credentials = service_account.Credentials.from_service_account_info(credentials_dict)
        return storage.Client(credentials=credentials, project=credentials_dict.get('project_id'))

    def get_buckets(self, credentials_dict):
        """Return every bucket name, or [] on any error."""
        try:
            return [bucket.name for bucket in self._client(credentials_dict).list_buckets()]
        except Exception as e:
            print(f"❌ GCP Bucket Error: {e}")
            return []

    def get_files(self, credentials_dict, bucket_name):
        """Return blob names in *bucket_name*, or [] on any error."""
        try:
            return [blob.name for blob in self._client(credentials_dict).list_blobs(bucket_name)]
        except Exception as e:
            print(f"❌ GCP List Error: {e}")
            return []

    def download_file(self, credentials_dict, bucket_name, blob_name):
        """Download one blob's bytes, or b"" on any error."""
        try:
            blob = self._client(credentials_dict).bucket(bucket_name).blob(blob_name)
            return blob.download_as_bytes()
        except Exception as e:
            print(f"❌ GCP Download Error: {e}")
            return b""
|
connectors/gmail_handler.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import os
|
| 3 |
+
import pickle
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
from googleapiclient.discovery import build
|
| 7 |
+
from google_auth_oauthlib.flow import InstalledAppFlow
|
| 8 |
+
from google.auth.transport.requests import Request
|
| 9 |
+
|
| 10 |
+
class GmailHandler:
    """Connector that authenticates against the Gmail API and pulls recent emails
    into a DataFrame with Source/Sender/Subject/Content columns."""

    def __init__(self):
        print("β Gmail Handler loaded.")

    def _get_credentials(self, credentials_file, scopes, token_path):
        """Load cached OAuth credentials, refreshing or re-running the installed-app
        flow as needed, and persist the refreshed token back to token_path."""
        creds = None
        if os.path.exists(token_path):
            # NOTE(review): unpickling the local token cache is conventional for
            # Gmail quickstarts, but never unpickle files from untrusted sources.
            with open(token_path, 'rb') as token:
                creds = pickle.load(token)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                # InstalledAppFlow requires a real file path, so persist the
                # uploaded client secret briefly and always clean it up.
                secret_path = "temp_client_secret.json"
                with open(secret_path, "wb") as f:
                    f.write(credentials_file.getvalue())
                try:
                    flow = InstalledAppFlow.from_client_secrets_file(secret_path, scopes)
                    creds = flow.run_local_server(port=0)
                finally:
                    # Fix: previously the secret file could linger on disk if the
                    # OAuth flow raised; remove it on every path.
                    if os.path.exists(secret_path):
                        os.remove(secret_path)

            with open(token_path, 'wb') as token:
                pickle.dump(creds, token)
        return creds

    def _extract_body(self, payload):
        """Pull the plain-text body out of a message payload (multipart or simple)."""
        body = ""
        if 'parts' in payload:
            for part in payload['parts']:
                if part['mimeType'] == 'text/plain' and 'data' in part['body']:
                    # errors="replace": one badly-encoded part must not abort the batch.
                    body += base64.urlsafe_b64decode(part['body']['data']).decode(errors="replace")
        elif 'body' in payload and 'data' in payload['body']:
            body += base64.urlsafe_b64decode(payload['body']['data']).decode(errors="replace")
        return body

    def fetch_emails(self, credentials_file, num_emails=10) -> pd.DataFrame:
        """
        Authenticates and fetches emails from Gmail.

        Args:
            credentials_file: uploaded client-secret file object (supports .getvalue()).
            num_emails: maximum number of messages to fetch.

        Returns:
            DataFrame with Source/Sender/Subject/Content columns; empty on failure.
        """
        try:
            creds = self._get_credentials(
                credentials_file,
                ['https://www.googleapis.com/auth/gmail.readonly'],
                'token.pickle',
            )

            service = build('gmail', 'v1', credentials=creds)
            results = service.users().messages().list(userId='me', maxResults=num_emails).execute()
            messages = results.get('messages', [])

            email_data = []
            for message in messages:
                msg = service.users().messages().get(userId='me', id=message['id']).execute()
                payload = msg['payload']
                # Fix: default to [] — a message without headers previously made
                # next(... for h in headers ...) raise TypeError and abort the fetch.
                headers = payload.get("headers", [])

                subject = next((h['value'] for h in headers if h['name'] == 'Subject'), "No Subject")
                sender = next((h['value'] for h in headers if h['name'] == 'From'), "Unknown")

                # Strip any HTML markup so Content is clean text.
                clean_body = BeautifulSoup(self._extract_body(payload), "html.parser").get_text()
                email_data.append({
                    "Source": "Gmail",
                    "Sender": sender,
                    "Subject": subject,
                    "Content": f"Subject: {subject}\n\n{clean_body}"
                })

            return pd.DataFrame(email_data)

        except Exception as e:
            print(f"β Gmail Error: {e}")
            return pd.DataFrame()
connectors/mongo_handler.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from urllib.parse import quote_plus
|
| 3 |
+
|
| 4 |
+
class MongoHandler:
    """Connector that samples documents from a MongoDB collection into a DataFrame."""

    def __init__(self):
        # pymongo is optional; degrade gracefully when it is not installed.
        try:
            import pymongo
            self.pymongo = pymongo
            print("β MongoDB Handler loaded.")
        except ImportError:
            self.pymongo = None
            print("β PyMongo not installed.")

    def fetch_data(self, host, port, db, user, pw, collection):
        """
        Fetch up to 100 documents from db.collection as a flattened DataFrame.

        Credentials are URL-escaped so special characters in user/password are safe
        in the connection URI. Returns an empty DataFrame when pymongo is
        unavailable or on any connection/query error.
        """
        if not self.pymongo:
            return pd.DataFrame()

        client = None
        try:
            if user and pw:
                safe_user = quote_plus(user)
                safe_pw = quote_plus(pw)
                uri = f"mongodb://{safe_user}:{safe_pw}@{host}:{port}/"
            else:
                uri = f"mongodb://{host}:{port}/"

            client = self.pymongo.MongoClient(uri, serverSelectionTimeoutMS=5000)
            # Force a server round-trip so bad connections fail fast here.
            client.server_info()

            data = list(client[db][collection].find().limit(100))
            if not data:
                return pd.DataFrame()

            # ObjectId is not DataFrame-friendly; stringify it.
            for d in data:
                if '_id' in d:
                    d['_id'] = str(d['_id'])

            return pd.json_normalize(data)

        except Exception as e:
            print(f"β Mongo Error: {e}")
            return pd.DataFrame()
        finally:
            # Fix: the client was previously never closed, leaking the connection
            # pool on every call. Close it on success and failure alike.
            if client is not None:
                client.close()
connectors/mysql_handler.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sqlalchemy import create_engine
|
| 3 |
+
from urllib.parse import quote_plus
|
| 4 |
+
|
| 5 |
+
class MysqlHandler:
    """Connector that samples rows from a MySQL table into a DataFrame."""

    def __init__(self):
        print("β MySQL Handler loaded.")

    def fetch_data(self, host, port, db, user, pw, table):
        """
        Connects to MySQL and fetches the first 100 rows of a table.

        Args:
            host, port, db, user, pw: connection parameters.
            table: table name; must be a plain identifier (letters, digits, _, $).

        Returns:
            DataFrame of up to 100 rows; empty DataFrame on any failure or if the
            table name is not a safe identifier.
        """
        import re
        try:
            # Fix: the table name was interpolated into SQL unchecked (injection
            # risk). Table names cannot be bound as query parameters, so restrict
            # them to plain identifiers instead.
            if not re.fullmatch(r"[A-Za-z0-9_$]+", table or ""):
                print(f"β MySQL Error: invalid table name {table!r}")
                return pd.DataFrame()

            # Fix: escape the username too, not only the password — both can
            # contain URI-reserved characters.
            safe_user = quote_plus(user)
            safe_pw = quote_plus(pw)
            # Uses mysql+pymysql driver
            conn_str = f"mysql+pymysql://{safe_user}:{safe_pw}@{host}:{port}/{db}"
            engine = create_engine(conn_str)

            query = f"SELECT * FROM {table} LIMIT 100"
            return pd.read_sql(query, engine)
        except Exception as e:
            print(f"β MySQL Error: {e}")
            return pd.DataFrame()
connectors/postgres_handler.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sqlalchemy import create_engine
|
| 3 |
+
from urllib.parse import quote_plus
|
| 4 |
+
|
| 5 |
+
class PostgresHandler:
    """Connector that samples rows from a PostgreSQL table into a DataFrame."""

    def __init__(self):
        print("β PostgreSQL Handler loaded.")

    def fetch_data(self, host, port, db, user, pw, table):
        """
        Connects to PostgreSQL and fetches the first 100 rows of a table.

        Args:
            host, port, db, user, pw: connection parameters.
            table: table name; must be a plain identifier (letters, digits, _, $).

        Returns:
            DataFrame of up to 100 rows; empty DataFrame on any failure or if the
            table name is not a safe identifier.
        """
        import re
        try:
            # Fix: the table name was interpolated into SQL unchecked (injection
            # risk). Table names cannot be bound as query parameters, so restrict
            # them to plain identifiers instead.
            if not re.fullmatch(r"[A-Za-z0-9_$]+", table or ""):
                print(f"β PostgreSQL Error: invalid table name {table!r}")
                return pd.DataFrame()

            # Fix: escape the username too, not only the password — both can
            # contain URI-reserved characters.
            safe_user = quote_plus(user)
            safe_pw = quote_plus(pw)
            # SQLAlchemy connection string
            conn_str = f"postgresql://{safe_user}:{safe_pw}@{host}:{port}/{db}"
            engine = create_engine(conn_str)

            query = f"SELECT * FROM {table} LIMIT 100"
            return pd.read_sql(query, engine)
        except Exception as e:
            print(f"β PostgreSQL Error: {e}")
            return pd.DataFrame()
connectors/slack_handler.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from slack_sdk import WebClient
|
| 3 |
+
from slack_sdk.errors import SlackApiError
|
| 4 |
+
import datetime
|
| 5 |
+
|
| 6 |
+
class SlackHandler:
    """Connector that reads recent messages from a Slack channel via the Web API."""

    def __init__(self):
        print("β Slack Handler loaded.")

    def fetch_messages(self, token, channel_id, num_messages=20):
        """
        Fetches recent messages from a specific Slack channel.

        Args:
            token: Slack bot/user token.
            channel_id: channel to read from.
            num_messages: maximum number of messages to request.

        Returns:
            DataFrame with Source/Sender/Subject/Content columns; empty on failure
            or when the channel has no plain messages.
        """
        try:
            api = WebClient(token=token)
            # Pull the channel's recent conversation history.
            history = api.conversations_history(channel=channel_id, limit=num_messages)

            collected = []
            if history['ok']:
                for entry in history['messages']:
                    # Ignore system events such as 'channel_join'; keep real posts.
                    if 'subtype' in entry:
                        continue
                    author = entry.get('user', 'Unknown')
                    body = entry.get('text', '')
                    stamp = float(entry.get('ts', 0))
                    when = datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d %H:%M:%S')

                    collected.append({
                        "Source": "Slack",
                        "Sender": author,
                        "Subject": f"Message in {channel_id} at {when}",
                        "Content": body
                    })

            if not collected:
                print("β οΈ No messages found in channel.")
                return pd.DataFrame()

            return pd.DataFrame(collected)

        except SlackApiError as e:
            print(f"β Slack API Error: {e.response['error']}")
            return pd.DataFrame()
        except Exception as e:
            print(f"β Slack Handler Error: {e}")
            return pd.DataFrame()
file_handlers/__init__.py
ADDED
|
File without changes
|
file_handlers/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (201 Bytes). View file
|
|
|
file_handlers/__pycache__/avro_handler.cpython-313.pyc
ADDED
|
Binary file (2.01 kB). View file
|
|
|
file_handlers/__pycache__/json_handler.cpython-313.pyc
ADDED
|
Binary file (2.37 kB). View file
|
|
|
file_handlers/__pycache__/ocr_engine.cpython-313.pyc
ADDED
|
Binary file (1.92 kB). View file
|
|
|
file_handlers/__pycache__/parquet_handler.cpython-313.pyc
ADDED
|
Binary file (1.76 kB). View file
|
|
|
file_handlers/__pycache__/pdf_handler.cpython-313.pyc
ADDED
|
Binary file (3.8 kB). View file
|
|
|
file_handlers/avro_handler.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# avro_handler.py
|
| 2 |
+
import io
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
# avro_handler.py
class AvroHandler:
    """Decodes Avro container files into DataFrames (requires fastavro)."""

    def __init__(self):
        # fastavro is optional; record availability so callers degrade gracefully.
        self.available = False
        try:
            import fastavro
            self.fastavro = fastavro
            self.available = True
            print("β Avro Handler loaded.")
        except ImportError:
            print("β fastavro not found. Please run: pip install fastavro")

    def convert_to_dataframe(self, file_bytes: bytes) -> pd.DataFrame:
        """
        Reads Avro bytes and converts them to a Pandas DataFrame.

        Returns an empty DataFrame when fastavro is unavailable, the payload is
        empty, or decoding fails.
        """
        if not self.available:
            return pd.DataFrame()

        try:
            # Wrap the raw bytes so fastavro can treat them as a file object.
            buffer = io.BytesIO(file_bytes)
            rows = list(self.fastavro.reader(buffer))
            return pd.DataFrame(rows) if rows else pd.DataFrame()
        except Exception as e:
            print(f"β οΈ Avro Read Error: {e}")
            return pd.DataFrame()
file_handlers/json_handler.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import io
|
| 4 |
+
|
| 5 |
+
class JsonHandler:
    """Loads JSON input and flattens nested structures into a flat DataFrame."""

    def __init__(self):
        print("β JSON Handler loaded.")

    def read_file(self, file_obj) -> pd.DataFrame:
        """
        Reads a JSON file object (or Streamlit UploadedFile) and flattens it.

        Nested dicts become underscore-joined column names ("a_b"); lists are
        stringified under a "<prefix>list" column. A top-level JSON array yields
        one row per element. Returns an empty DataFrame on any parse error.
        """
        try:
            # Streamlit uploads expose getvalue() (raw bytes); plain file objects
            # are handed straight to json.load.
            if hasattr(file_obj, "getvalue"):
                data = json.loads(file_obj.getvalue().decode('utf-8'))
            else:
                data = json.load(file_obj)

            def flatten(node, prefix=''):
                # Dicts recurse, extending the underscore-joined key path; lists
                # are stringified; scalars keep the accumulated key minus the
                # trailing underscore.
                if isinstance(node, dict):
                    flat = {}
                    for key in node:
                        flat.update(flatten(node[key], prefix + key + '_'))
                    return flat
                if isinstance(node, list):
                    return {f"{prefix}list": str(node)}
                return {prefix[:-1]: node}

            if isinstance(data, list):
                return pd.DataFrame([flatten(item) for item in data])
            return pd.DataFrame([flatten(data)])

        except Exception as e:
            print(f"β JSON Read Error: {e}")
            return pd.DataFrame()