Spaces:

rishabh5752
/

PII-Detector

Sleeping

App Files Files Community

rishabh5752 committed on Sep 8, 2025

Commit

98e775c

verified ·

1 Parent(s): 3e57e50

Added App.py and Requirements.txt

Browse files

Files changed (2) hide show

app (9).py +939 -0
requirements (8).txt +5 -0

app (9).py ADDED Viewed

	@@ -0,0 +1,939 @@

+import gradio as gr
+import re
+import json
+import torch
+from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
+import faker
+from typing import List, Dict, Any, Optional
+import pandas as pd
+class EnhancedPiiProtectionPipeline:
+    """
+    A comprehensive PII protection pipeline that:
+    1. Uses regex for all detectable patterns first
+    2. Uses multiple custom NER models for remaining detection
+    3. Provides three protection methods: labeling, masking, and synthesis
+    4. Handles general, Indian-specific, address, and medical contexts
+    """
+    def __init__(
+        self,
+        main_model_name: str = "Kashish-jain/pii-protection-model",
+        medical_model_name: str = "Kashish-jain/pii-protection-medical",
+        use_medical_model: bool = False
+    ):
+        """
+        Initialize the comprehensive PII protection pipeline.
+        Args:
+            main_model_name: HuggingFace model name or path for the main PII model
+            medical_model_name: HuggingFace model name for the medical NER model
+            use_medical_model: Whether to load and use the medical model
+        """
+        # Main model
+        self.main_tokenizer = AutoTokenizer.from_pretrained(main_model_name)
+        self.main_model = pipeline("ner", model=main_model_name, tokenizer=self.main_tokenizer, aggregation_strategy="simple")
+        # Address-specific model - implementation simplified
+        self.address_model = self.main_model  # Fallback to main model for simplicity
+        # Medical model
+        self.use_medical_model = use_medical_model
+        self.medical_model = None
+        self.medical_tokenizer = None
+        if use_medical_model and medical_model_name:
+            try:
+                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+                self.device = device
+                self.medical_tokenizer = AutoTokenizer.from_pretrained(medical_model_name)
+                self.medical_model = pipeline(
+                    "ner",
+                    model=medical_model_name,
+                    tokenizer=self.medical_tokenizer,
+                    aggregation_strategy="simple",
+                    device=0 if torch.cuda.is_available() else -1
+                )
+                print(f"Medical model '{medical_model_name}' loaded successfully")
+            except Exception as e:
+                print(f"Warning: Could not load medical model. Error: {str(e)}")
+                self.use_medical_model = False
+        self.faker = faker.Faker('en_IN')
+        # Set up regex patterns for common PII entities - IMPROVED PATTERNS
+        self.regex_patterns = {
+            # Phone numbers - Fixed to prevent partial matches
+            'PHONENUMBER': r'(?<!\w)(?:\+91[\-\s]?[789]\d{9}|(?:\+91[\-\s]?)?\d{3}[\-\.\s]?\d{3}[\-\.\s]?\d{4}|(?:\d{3}[\-\s]?){2}\d{4})(?!\d)',
+            # Email
+            'EMAIL': r'(?<!\w)[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}(?!\w)',
+            # IP addresses
+            'IPV4': r'(?<!\w)(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?!\w)',
+            # Credit cards
+            'CREDITCARDNUMBER': r'(?<!\w)(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|6(?:011|5\d{2})\d{12}|3[47]\d{13}|3(?:0[0-5]|[68]\d)\d{11}|(?:2131|1800|35\d{3})\d{11})(?!\w)',
+            # PAN (Indian Permanent Account Number)
+            'PAN': r'(?<!\w)[A-Z]{5}[0-9]{4}[A-Z](?!\w)',
+            # Aadhar (Indian ID)
+            'AADHAR': r'(?<!\w)(?:\d{4}\s\d{4}\s\d{4}|\d{12})(?!\d)',
+            # Passport
+            'PASSPORT': r'(?<!\w)[A-Z]{1,2}\d{7}(?!\w)',
+            # URL
+            'URL': r'(?<!\w)https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)(?!\w)',
+            # Dates
+            'DOB': r'(?<!\w)(?:0[1-9]|[12][0-9]|3[01])[/\-\.](?:0[1-9]|1[0-2])[/\-\.](?:19|20)\d{2}(?!\w)',
+            # PINCODE
+            'PINCODE': r'(?<!\w)(?:PIN[\s-]*)?\d{6}(?!\d)',
+            # Bank account & IBAN
+            'ACCOUNTNUMBER': r'(?<!\w)(?:A/C|Account|ACC)(?:ount)?\s*(?:Number|No|#)?[:\s-]*(\d{9,17})(?!\d)',
+            'IBAN_CODE': r'(?<!\w)(?:IBAN|International Bank Account Number)?[:\s]*[A-Z]{2}\d{2}[A-Z0-9]{4}[0-9]{7}(?:[0-9]{0,16})(?!\w)',
+            # Social Security Number (US)
+            'SSN': r'(?<!\w)\d{3}[-\s]?\d{2}[-\s]?\d{4}(?!\w)',
+            # Driver's License (simplified)
+            'DRIVER_LICENSE': r'(?<!\w)(?:[A-Z]{1,2}-\d{5,8}|\d{7,9}|[A-Z]\d{3}-\d{4}-\d{4}|\d{3}-\d{2}-\d{4})(?!\w)'
+        }
+        # Medical entity regex patterns - ENHANCED to only capture the value part, not label
+        self.medical_regex_patterns = {
+            'DOCTORNAME': r'(?:Dr\.?|Doctor)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
+            'PATIENTID': r'(?:Patient\s+ID|ID|MRN)[\s-]*[:]\s*([A-Z0-9]{5,12})',  # Modified to use a capture group
+            'MEDICALID': r'(?:Medical\s+Record|MRN|Patient\s+ID)[\s-]*[:]\s*([A-Z0-9]{4,15})',  # Modified to use a capture group
+        }
+        # Separated measurements with capture groups to get just the values, not labels
+        self.measurement_patterns = {
+            # Height with capture group for just the measurement value
+            'HEIGHT': r'(?:Height|Ht)[\s-]*[:]\s*((?:\d{1,2}\'\s*(?:\d{1,2}\")?|\d{3}\s*cm|\d{1,2}\.\d{1,2}\s*m))',
+            # Weight with capture group for just the measurement value
+            'WEIGHT': r'(?:Weight|Wt)[\s-]*[:]\s*((?:\d{1,3}(?:\.\d{1,2})?\s*(?:kg|lbs?|pounds?|kilograms?)))',
+            # Blood group/type with separate regex for the value only
+            'BLOOD_TYPE': r'(?:Blood\s+[Tt]ype|Blood\s+[Gg]roup)[\s-]*[:]\s*((?:A|B|AB|O)[+-])',
+        }
+        # Standalone measurement patterns (no labels)
+        self.standalone_medical_patterns = {
+            'HEIGHT_STANDALONE': r'(?<!\w)(?:\d{1,2}\'\s*\d{1,2}\"|\d{1,2}\'\d{1,2}\"|\d{1,2}\'|\d{3}\s*cm|\d{1,2}\.\d{1,2}\s*m)(?!\w)',
+            'WEIGHT_STANDALONE': r'(?<!\w)(?:\d{1,3}(?:\.\d{1,2})?\s*(?:kg|lbs?|pounds?|kilograms?))(?!\w)',
+            'BLOOD_TYPE_STANDALONE': r'(?<!\w)(?:A|B|AB|O)[+-](?!\w)'
+        }
+        # Combine all regex patterns
+        self.all_regex_patterns = {
+            **self.regex_patterns,
+            **self.medical_regex_patterns,
+            **self.measurement_patterns,
+            **self.standalone_medical_patterns
+        }
+    def regex_detection(self, text: str) -> List[Dict[str, Any]]:
+        """Detect PII using regex patterns with improved capture groups."""
+        entities = []
+        for entity_type, pattern in self.all_regex_patterns.items():
+            for match in re.finditer(pattern, text, re.IGNORECASE):
+                # For patterns with capture groups, use the first group if it exists
+                if match.groups() and match.group(1):
+                    # For labeled patterns with capture groups (e.g., "Height: 5'6"")
+                    captured_text = match.group(1)
+                    # Calculate start/end positions for the captured group
+                    start = match.start(1)
+                    end = match.end(1)
+                else:
+                    # For patterns without capture groups or standalone measurements
+                    captured_text = match.group(0)
+                    start = match.start(0)
+                    end = match.end(0)
+                # Handle standalone height/weight by renaming them
+                if entity_type == 'HEIGHT_STANDALONE':
+                    entity_type = 'HEIGHT'
+                elif entity_type == 'WEIGHT_STANDALONE':
+                    entity_type = 'WEIGHT'
+                elif entity_type == 'BLOOD_TYPE_STANDALONE':
+                    entity_type = 'BLOOD_TYPE'
+                entities.append({
+                    "text": captured_text,
+                    "label": entity_type,
+                    "start": start,
+                    "end": end,
+                    "score": 0.95,  # High confidence for regex matches
+                    "_original_text": text  # Store original text for context
+                })
+        return entities
+    def ner_detection(self, text: str, model_type: str = "main") -> List[Dict[str, Any]]:
+        """
+        Detect PII using NER models
+        Args:
+            text: Text to analyze
+            model_type: Type of model to use ("main", "medical")
+        """
+        if model_type == "medical" and not self.use_medical_model:
+            return []
+        model = self.medical_model if model_type == "medical" else self.main_model
+        try:
+            results = model(text)
+            # Convert to standard format
+            entities = []
+            for result in results:
+                # Skip low confidence predictions
+                if result.get('score', 0) < 0.5:
+                    continue
+                # Clean entity type
+                entity_type = result.get('entity_group', result.get('entity', '')).replace('B-', '').replace('I-', '')
+                entities.append({
+                    "text": result.get('word', text[result['start']:result['end']]),
+                    "label": entity_type,
+                    "start": result['start'],
+                    "end": result['end'],
+                    "score": result.get('score', 0.7),
+                    "_original_text": text  # Store original text for context
+                })
+            return entities
+        except Exception as e:
+            print(f"Error with NER detection: {str(e)}")
+            return []
+    def merge_entities(self, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Merge adjacent entities of the same or related types that likely form a single entity"""
+        if not entities:
+            return []
+        # Sort entities by start position
+        entities.sort(key=lambda x: x['start'])
+        merged = []
+        # Define related entity groups (entities that could be part of the same larger entity)
+        related_types = {
+            'NAME': ['FIRSTNAME', 'MIDDLENAME', 'LASTNAME', 'PREFIX'],
+            'ADDRESS': ['STREET', 'CITY', 'STATE', 'ZIPCODE', 'BUILDINGNUMBER'],
+            'PHONENUMBER': ['PHONENUMBER']  # Explicitly add PHONENUMBER to prevent merging with other types
+        }
+        # Flatten the related types for quick lookup
+        related_types_flat = {}
+        for main_type, sub_types in related_types.items():
+            for sub_type in sub_types:
+                related_types_flat[sub_type] = main_type
+        # Helper function to check if two entity types are related
+        def are_related(type1, type2):
+            # Same type is related
+            if type1 == type2:
+                return True
+            # Prevent merging PHONENUMBER with other types
+            if type1 == 'PHONENUMBER' or type2 == 'PHONENUMBER':
+                return type1 == type2
+            # Check if they're in the same group
+            for group, types in related_types.items():
+                if type1 in types and type2 in types:
+                    return True
+                if type1 == group and type2 in types:
+                    return True
+                if type2 == group and type1 in types:
+                    return True
+            # Check through the flattened related types
+            if type1 in related_types_flat and related_types_flat[type1] == type2:
+                return True
+            if type2 in related_types_flat and related_types_flat[type2] == type1:
+                return True
+            return False
+        for entity in entities:
+            if not merged:
+                merged.append(entity.copy())
+                continue
+            last = merged[-1]
+            # Maximum space between tokens that could be part of the same entity
+            # For adjacent words, this would typically be 1 (the space)
+            max_gap = 5
+            # Check if entities could be part of the same larger entity:
+            # 1. Same or related entity type
+            # 2. Within a reasonable distance
+            # 3. No other complete word between them
+            if (are_related(entity['label'], last['label']) and
+                entity['start'] - last['end'] <= max_gap):
+                # Get the text between the two entities
+                between_text = entity.get('_original_text', '')[last['end']:entity['start']] \
+                            if '_original_text' in entity and '_original_text' in last \
+                            else ' '
+                # Only merge if the gap contains just spaces or very simple punctuation
+                if between_text.strip() in ['', ' ', '.', ',', '-', '_']:
+                    # Create merged entity with all text between start and end
+                    if '_original_text' in entity and '_original_text' in last:
+                        full_text = last['_original_text'][last['start']:entity['end']]
+                    else:
+                        full_text = last['text'] + between_text + entity['text']
+                    last['text'] = full_text
+                    last['end'] = entity['end']
+                    # When merging different entity types, prefer the broader category
+                    if last['label'] in related_types_flat and entity['label'] == related_types_flat[last['label']]:
+                        last['label'] = entity['label']
+                    elif entity['label'] in related_types_flat and last['label'] == related_types_flat[entity['label']]:
+                        # Keep last['label'] as is
+                        pass
+                    last['score'] = max(last.get('score', 0), entity.get('score', 0))
+                else:
+                    merged.append(entity.copy())
+            else:
+                merged.append(entity.copy())
+        return merged
+    def remove_overlapping_entities(self, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Remove overlapping entities by keeping the highest scoring one"""
+        if not entities:
+            return []
+        # Sort by start position
+        entities.sort(key=lambda x: x['start'])
+        # Identify overlapping entities
+        non_overlapping = []
+        i = 0
+        while i < len(entities):
+            current = entities[i]
+            # Find all entities that overlap with the current one
+            overlapping = [current]
+            j = i + 1
+            while j < len(entities) and entities[j]['start'] < current['end']:
+                overlapping.append(entities[j])
+                j += 1
+            # Keep the highest scoring entity from overlapping group
+            if len(overlapping) > 1:
+                best_entity = max(overlapping, key=lambda x: x.get('score', 0))
+                non_overlapping.append(best_entity)
+            else:
+                non_overlapping.append(current)
+            # Move index to start after all overlapping entities
+            i = j
+        return non_overlapping
+    def generate_synthetic_value(self, entity_type: str, original_value: str = None) -> str:
+        """Generate realistic synthetic data for PII."""
+        try:
+            if entity_type in ['PERSON', 'NAME', 'FIRSTNAME', 'LASTNAME']:
+                return self.faker.name()
+            elif entity_type == 'EMAIL':
+                return self.faker.email()
+            elif entity_type == 'PHONENUMBER':
+                return self.faker.phone_number()
+            elif entity_type == 'PAN':
+                return self.faker.bothify('?????####?').upper()
+            elif entity_type == 'AADHAR':
+                return ' '.join([self.faker.numerify('####') for _ in range(3)])
+            elif entity_type == 'CREDITCARDNUMBER' or entity_type == 'CREDIT_CARD':
+                return self.faker.credit_card_number()
+            elif entity_type == 'ACCOUNTNUMBER' or entity_type == 'IBAN_CODE' or entity_type == 'BANK_NUMBER':
+                return self.faker.bban()
+            elif entity_type == 'PASSPORT' or entity_type == 'US_PASSPORT':
+                return f"{self.faker.random_letter().upper()}{self.faker.random_letter().upper()}{self.faker.numerify('######')}"
+            elif entity_type == 'DOB' or entity_type == 'DATE_TIME':
+                return self.faker.date_of_birth(minimum_age=18, maximum_age=90).strftime('%d/%m/%Y')
+            elif entity_type == 'IPV4' or entity_type == 'IP_ADDRESS':
+                return self.faker.ipv4()
+            elif entity_type == 'URL':
+                return self.faker.url()
+            elif entity_type == 'PINCODE':
+                return self.faker.postcode()
+            elif entity_type == 'CITY' or entity_type == 'LOCATION':
+                return self.faker.city()
+            elif entity_type == 'STATE':
+                return self.faker.state()
+            elif entity_type == 'SSN' or entity_type == 'US_SSN':
+                return self.faker.ssn()
+            elif entity_type == 'DRIVER_LICENSE' or entity_type == 'US_DRIVER_LICENSE':
+                return self.faker.bothify('?#######')
+            elif entity_type == 'CRYPTO':
+                return self.faker.cryptocurrency_code() + self.faker.bothify('??##??##??##??')
+            # Medical entity generation
+            elif entity_type == 'DOCTORNAME':
+                return f"Dr. {self.faker.last_name()}"
+            elif entity_type == 'PATIENTID' or entity_type == 'MEDICALID':
+                return self.faker.bothify('PT#######')
+            elif entity_type == 'HEIGHT':
+                # Generate a realistic height in feet and inches
+                feet = self.faker.random_int(min=4, max=6)
+                inches = self.faker.random_int(min=0, max=11)
+                return f"{feet}'{inches}\""
+            elif entity_type == 'WEIGHT':
+                # Generate a realistic weight in kg
+                weight = self.faker.random_int(min=45, max=100)
+                return f"{weight}kg"
+            elif entity_type == 'BLOOD_TYPE':
+                # Generate a random blood type
+                blood_groups = ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']
+                return self.faker.random_element(blood_groups)
+            else:
+                # Fallback for unknown types
+                return f"[SYNTHETIC_{entity_type}]"
+        except Exception as e:
+            print(f"Error generating synthetic value: {str(e)}")
+            return f"[SYNTHETIC_{entity_type}]"
+    def process_text(self, text: str, model_type: str = "main", protection_method: str = "replace") -> Dict[str, Any]:
+        """
+        Process text to detect and protect PII
+        Args:
+            text: Input text to process
+            model_type: Type of model to use ("main", "medical")
+            protection_method: Protection method ("replace", "mask", "synthesize")
+        Returns:
+            Dict containing protected text and detected entities
+        """
+        # Step 1: Get entities from regex
+        regex_entities = self.regex_detection(text)
+        # Step 2: Get entities from NER model
+        ner_entities = self.ner_detection(text, model_type)
+        # Step 3: Combine and process entities
+        all_entities = regex_entities + ner_entities
+        merged_entities = self.merge_entities(all_entities)
+        final_entities = self.remove_overlapping_entities(merged_entities)
+        # Step 4: Create protected text based on method
+        protected_text = text
+        # Sort entities by start position in reverse to avoid index issues when replacing
+        final_entities_sorted = sorted(final_entities, key=lambda x: x['start'], reverse=True)
+        if protection_method == "mask":
+            # Mask with asterisks
+            for entity in final_entities_sorted:
+                mask = '*' * len(entity['text'])
+                protected_text = protected_text[:entity['start']] + mask + protected_text[entity['end']:]
+        elif protection_method == "synthesize":
+            # Replace with synthetic values
+            for entity in final_entities_sorted:
+                synthetic = self.generate_synthetic_value(entity['label'], entity['text'])
+                protected_text = protected_text[:entity['start']] + synthetic + protected_text[entity['end']:]
+        else:  # replace (default)
+            # Replace with entity tags
+            for entity in final_entities_sorted:
+                tag = f"[{entity['label']}]"
+                protected_text = protected_text[:entity['start']] + tag + protected_text[entity['end']:]
+        # Create findings table
+        findings = []
+        for i, entity in enumerate(final_entities):
+            findings.append({
+                "index": i,
+                "entity_type": entity['label'],
+                "text": entity['text'],
+                "start": entity['start'],
+                "end": entity['end'],
+                "confidence": round(entity.get('score', 1.0), 2)
+            })
+        return {
+            "protected_text": protected_text,
+            "entities": final_entities,
+            "findings": findings
+        }
+# Example input text
+# Sample for the general-purpose model: used as the initial value of the input
+# box and returned by update_input_text for any non-medical model type.
+example_text = """
+Hi, my name is John Doe and I'm originally from Delhi.
+On 11/10/2024 I visited https://www.google.com and sent an email to abc@gmail.com, from IP 192.168.0.1.
+My phone number: +91-1234321216.
+"""
+# Sample for the medical model: returned by update_input_text when the model
+# type is "medical".
+medical_example_text = """
+Patient name: John Doe
+Date of Birth: 05/12/1982
+Patient ID: PT789456
+Contact: +91-9876543210
+Dr. Robert Johnson has prescribed medication penicillin on 12/12/2024.
+Blood type: O+, Height: 5'6", Weight: 145kg
+"""
+# Create Gradio Interface
+def process_input(text, model_type, protection_method):
+    # Initialize pipeline with Hugging Face model paths
+    main_model_name = "Kashish-jain/pii-protection-model"
+    medical_model_name = "Kashish-jain/pii-protection-medical"
+    use_medical = model_type == "medical"
+    pipeline = EnhancedPiiProtectionPipeline(
+        main_model_name=main_model_name,
+        medical_model_name=medical_model_name,
+        use_medical_model=use_medical
+    )
+    # Process the text
+    result = pipeline.process_text(text, model_type, protection_method)
+    # Create findings table
+    if result["findings"]:
+        df = pd.DataFrame(result["findings"])
+        df = df.rename(columns={
+            "index": "#",
+            "entity_type": "Entity type",
+            "text": "Text",
+            "start": "Start",
+            "end": "End",
+            "confidence": "Confidence"
+        })
+    else:
+        df = pd.DataFrame(columns=["#", "Entity type", "Text", "Start", "End", "Confidence"])
+    # Count detected entities by type
+    if result["findings"]:
+        entity_counts = df["Entity type"].value_counts().to_dict()
+        entity_summary = ", ".join([f"{count} {entity}" for entity, count in entity_counts.items()])
+    else:
+        entity_summary = "No entities detected"
+    return result["protected_text"], df, entity_summary
+# Update input text based on model type
+def update_input_text(model_type):
+    if model_type == "medical":
+        return medical_example_text
+    else:
+        return example_text
+# Custom CSS for a minimalistic, clean design
+# Passed to gr.Blocks(css=...) when the interface is built. Design tokens are
+# declared as CSS variables on :root; a prefers-color-scheme: dark media query
+# overrides the background, surface, border and text colors.
+custom_css = """
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Playfair+Display:wght@400;700&display=swap');
+:root {
+    --font-sans: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
+    --font-serif: 'Playfair Display', Georgia, Cambria, 'Times New Roman', Times, serif;
+    --color-primary: #2563eb;
+    --color-primary-light: #3b82f6;
+    --color-primary-dark: #1d4ed8;
+    --color-secondary: #64748b;
+    --color-secondary-light: #94a3b8;
+    --color-background: #00000f;
+    --color-surface: #f8fafc;
+    --color-border: #e2e8f0;
+    --color-text: #1e293b;
+    --color-text-light: #64748b;
+    --color-success: #10b981;
+    --color-warning: #f59e0b;
+    --color-error: #ef4444;
+    --shadow-sm: 0 1px 2px 0 rgba(0, 0, 0, 0.05);
+    --shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1), 0 1px 2px 0 rgba(0, 0, 0, 0.06);
+    --shadow-md: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
+    --shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
+    --radius-sm: 0.25rem;
+    --radius: 0.375rem;
+    --radius-md: 0.5rem;
+    --radius-lg: 0.75rem;
+    --spacing-1: 0.25rem;
+    --spacing-2: 0.5rem;
+    --spacing-3: 0.75rem;
+    --spacing-4: 1rem;
+    --spacing-6: 1.5rem;
+    --spacing-8: 2rem;
+    --spacing-12: 3rem;
+}
+body, .gradio-container {
+    font-family: var(--font-sans);
+    color: var(--color-text);
+    background-color: var(--color-background);
+    line-height: 1.5;
+}
+/* Typography */
+h1, h2, h3 {
+    font-family: var(--font-serif);
+    font-weight: 700;
+    line-height: 1.2;
+    margin-bottom: var(--spacing-4);
+}
+h1 {
+    font-size: 2.25rem;
+    color: var(--color-text-light);
+}
+h2 {
+    font-size: 1.5rem;
+    color: var(--color-text);
+}
+h3 {
+    font-size: 1.25rem;
+    color: var(--color-text);
+}
+p {
+    margin-bottom: var(--spacing-4);
+}
+/* Layout Components */
+.container {
+    max-width: 1500px;
+    margin: 0 auto;
+    padding: var(--spacing-6);
+}
+.card {
+    background-color: var(--color-surface);
+    border-radius: var(--radius);
+    box-shadow: var(--shadow);
+    padding: var(--spacing-6);
+    margin-bottom: var(--spacing-6);
+    border: 1px solid var(--color-border);
+}
+/* Form Elements */
+.gradio-button.primary {
+    background-color: var(--color-secondary-light);
+    color: white;
+    font-weight: 500;
+    border-radius: var(--radius);
+    padding: var(--spacing-3) var(--spacing-6);
+    transition: all 0.2s ease;
+    border: none;
+    box-shadow: var(--shadow);
+}
+.gradio-button.primary:hover {
+    background-color: var(--color-secondary);
+    box-shadow: var(--shadow-md);
+    transform: translateY(-1px);
+}
+.gradio-button.primary:active {
+    transform: translateY(0);
+}
+.gradio-dropdown, .gradio-textbox, .gradio-textarea {
+    border-radius: var(--radius);
+    border: 1px solid var(--color-border);
+    padding: var(--spacing-3);
+    background-color: var(--color-background);
+    transition: border-color 0.2s ease;
+}
+.gradio-dropdown:focus, .gradio-textbox:focus, .gradio-textarea:focus {
+    border-color: var(--color-primary-light);
+    outline: none;
+    box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
+}
+/* Tabs */
+.gradio-tabs {
+    margin-bottom: var(--spacing-6);
+}
+.gradio-tab-button {
+    padding: var(--spacing-3) var(--spacing-6);
+    font-weight: 500;
+    color: var(--color-text-light);
+    border-bottom: 2px solid transparent;
+    transition: all 0.2s ease;
+}
+.gradio-tab-button.selected {
+    color: var(--color-primary);
+    border-bottom-color: var(--color-primary);
+}
+/* Accordion */
+.gradio-accordion {
+    border: 1px solid var(--color-border);
+    border-radius: var(--radius);
+    margin-bottom: var(--spacing-6);
+    overflow: hidden;
+}
+.gradio-accordion-header {
+    padding: var(--spacing-4);
+    font-weight: 500;
+    background-color: var(--color-surface);
+    border-bottom: 1px solid var(--color-border);
+    cursor: pointer;
+}
+.gradio-accordion-content {
+    padding: var(--spacing-4);
+    background-color: var(--color-background);
+}
+/* Table */
+table {
+    width: 100%;
+    border-collapse: collapse;
+    margin-bottom: var(--spacing-6);
+}
+th {
+    background-color: var(--color-surface);
+    padding: var(--spacing-3) var(--spacing-4);
+    text-align: left;
+    font-weight: 600;
+    color: var(--color-text);
+    border-bottom: 2px solid var(--color-border);
+}
+td {
+    padding: var(--spacing-3) var(--spacing-4);
+    border-bottom: 1px solid var(--color-border);
+}
+/* Dark mode support */
+@media (prefers-color-scheme: dark) {
+    :root {
+        --color-background: #0f172a;
+        --color-surface: #1e293b;
+        --color-border: #334155;
+        --color-text: #f8fafc;
+        --color-text-light: #cbd5e1;
+    }
+}
+/* Custom components */
+.entity-badge {
+    display: inline-block;
+    padding: 0.25rem 0.5rem;
+    border-radius: 9999px;
+    font-size: 0.75rem;
+    font-weight: 500;
+    background-color: var(--color-primary-light);
+    color: white;
+    margin-right: 0.5rem;
+    margin-bottom: 0.5rem;
+}
+.summary-container {
+    background-color: var(--color-surface);
+    border-radius: var(--radius);
+    padding: var(--spacing-4);
+    margin-bottom: var(--spacing-6);
+    border: 1px solid var(--color-border);
+}
+.icon-text {
+    display: flex;
+    align-items: center;
+    gap: var(--spacing-2);
+}
+.icon-text svg {
+    width: 1.25rem;
+    height: 1.25rem;
+    color: var(--color-primary);
+}
+/* Responsive adjustments */
+@media (max-width: 768px) {
+    .container {
+        padding: var(--spacing-4);
+    }
+    h1 {
+        font-size: 1.75rem;
+    }
+    .card {
+        padding: var(--spacing-4);
+    }
+}
+"""
+# Create the Gradio interface with enhanced styling
+# Layout: header, a card with the two configuration dropdowns, input/output
+# text areas, entity summary and submit button, then a findings table card and
+# a collapsed help accordion. Event handlers are registered at the end.
+with gr.Blocks(css=custom_css, theme=gr.themes.Base()) as demo:
+    # Header section
+    with gr.Column(elem_classes="container"):
+        gr.Markdown("""
+        # 🛡️ PII Protection Tool
+        Detect, protect and de-identify personally identifiable information.
+        """)
+        # Main content area
+        with gr.Column(elem_classes="card"):
+            # Configuration section
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # Choices are (display label, value) pairs; the value is
+                    # what process_input receives as model_type.
+                    model_dropdown = gr.Dropdown(
+                        choices=[
+                            ("General Purpose", "main"),
+                            ("Medical Context", "medical")
+                        ],
+                        value="main",
+                        label="Model Type",
+                        elem_classes="form-control"
+                    )
+                with gr.Column(scale=1):
+                    # The value is what process_input receives as
+                    # protection_method.
+                    protection_dropdown = gr.Dropdown(
+                        choices=[
+                            ("Replace with Tags", "replace"),
+                            ("Mask with Asterisks", "mask"),
+                            ("Generate Synthetic Data", "synthesize")
+                        ],
+                        value="replace",
+                        label="Protection Method",
+                        elem_classes="form-control"
+                    )
+            # Divider
+            gr.Markdown("---")
+            # Input/Output section
+            with gr.Row():
+                # Input column
+                with gr.Column():
+                    gr.Markdown("### Input Text")
+                    # Pre-filled with the general-purpose sample
+                    input_text = gr.TextArea(
+                        label="",
+                        value=example_text,
+                        lines=10,
+                        elem_classes="text-input"
+                    )
+                # Output column
+                with gr.Column():
+                    gr.Markdown("### Protected Output")
+                    output_text = gr.TextArea(
+                        label="",
+                        lines=10,
+                        elem_classes="text-output"
+                    )
+            # Summary section
+            with gr.Column(elem_classes="summary-container"):
+                gr.Markdown("### Entity Summary")
+                entity_summary = gr.Textbox(
+                    label="",
+                    interactive=False,
+                    elem_classes="entity-summary"
+                )
+            # Action button
+            submit_btn = gr.Button(
+                "Process Text",
+                variant="primary",
+                elem_classes="submit-button"
+            )
+        # Findings section
+        with gr.Column(elem_classes="card"):
+            gr.Markdown("### Detected Entities")
+            # Headers match the column names of the DataFrame returned by
+            # process_input.
+            findings_table = gr.DataFrame(
+                headers=["#", "Entity type", "Text", "Start", "End", "Confidence"],
+                elem_classes="findings-table"
+            )
+        # Help section
+        with gr.Accordion("Help & Information", open=False, elem_classes="help-accordion"):
+            gr.Markdown("""
+            #### De-identification Methods
+            - **Replace with Tags**: Replaces each detected entity with its entity type tag (e.g., [NAME])
+            - **Mask with Asterisks**: Replaces each detected entity with asterisks (*)
+            - **Generate Synthetic Data**: Replaces each detected entity with realistic synthetic data
+            #### Model Types
+            - **General Purpose**: Optimized for common PII elements
+            - **Medical Context**: Enhanced detection for healthcare-related PII
+            #### Entity Types Detected
+            - **Personal**: NAME, EMAIL, PHONENUMBER, DOB
+            - **Financial**: CREDITCARDNUMBER, ACCOUNTNUMBER, PAN, IBAN_CODE, SSN
+            - **Location**: ADDRESS, CITY, STATE, PINCODE, IPV4
+            - **Medical**: DOCTORNAME, PATIENTID, MEDICALID
+            - **Other**: URL, PASSPORT, DRIVER_LICENSE
+            """)
+    # Set up event handlers
+    # Clicking the button runs process_input; its three return values fill the
+    # output text, the findings table and the entity summary, in that order.
+    submit_btn.click(
+        fn=process_input,
+        inputs=[input_text, model_dropdown, protection_dropdown],
+        outputs=[output_text, findings_table, entity_summary]
+    )
+    # Changing the model type replaces the input text with the matching sample.
+    model_dropdown.change(
+        fn=update_input_text,
+        inputs=[model_dropdown],
+        outputs=[input_text]
+    )
+# Launch the app
+# Only when run as a script; importing the module just builds `demo`.
+if __name__ == "__main__":
+    demo.launch()

requirements (8).txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio>=4.0.0
+transformers>=4.30.0
+torch>=2.0.0
+faker>=18.4.0
+pandas>=2.0.0