# Spaces: ANISA09 / ml (status: Sleeping)
# File size: 11,316 Bytes

import gradio as gr
from transformers import ViTImageProcessor, ViTForImageClassification
from PIL import Image
import torch
import pytesseract
import re
from datetime import datetime
import numpy as np

# One checkpoint name feeds both objects so the preprocessing always matches
# the weights it is paired with.
_VIT_CHECKPOINT = 'google/vit-base-patch16-224'

# Loaded once at import time and reused by every call to analyze_with_vit().
processor = ViTImageProcessor.from_pretrained(_VIT_CHECKPOINT)
model = ViTForImageClassification.from_pretrained(_VIT_CHECKPOINT)

def extract_text_from_image(image):
    """Run Tesseract OCR on ``image`` and return the recognised text.

    Failures are reported in-band: instead of raising, the function returns
    a string of the form ``"OCR Error: <message>"``.
    """
    try:
        return pytesseract.image_to_string(image)
    except Exception as exc:
        # Best-effort by design: the caller always receives a string.
        return "OCR Error: " + str(exc)

def extract_dates(text):
    """Return every date-like substring found in ``text``.

    Three layouts are recognised, case-insensitively:

    * numeric with the year last, e.g. ``15/01/2024`` or ``1-5-24``
    * numeric with a four-digit year first, e.g. ``2024-01-15``
    * month name first, e.g. ``Jan 15, 2024`` or ``January 15 2024``

    Matches are returned grouped by layout (in the order above), not in
    document order. Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []

    # The digit/word boundaries stop a pattern from matching *inside* a
    # longer token: without them "2024-01-15" also produced the fragment
    # "24-01-15", and "Summary 12, 2024" produced "mary 12, 2024".
    date_patterns = (
        r'(?<!\d)\d{1,2}[-/]\d{1,2}[-/]\d{2,4}(?!\d)',
        r'(?<!\d)\d{4}[-/]\d{1,2}[-/]\d{1,2}(?!\d)',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}(?!\d)',
    )

    dates = []
    for pattern in date_patterns:
        dates.extend(re.findall(pattern, text, re.IGNORECASE))

    return dates

def analyze_with_vit(image):
    """Classify ``image`` with the module-level ViT model.

    Returns a ``(confidence, predicted_class)`` tuple: the largest softmax
    probability expressed as a percentage, and the index of the largest
    logit. The score is the classifier's certainty about its top class,
    nothing more specific than that.
    """
    encoded = processor(images=image, return_tensors="pt")

    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        class_logits = model(**encoded).logits

    class_probs = torch.softmax(class_logits, dim=-1)
    top_probability = class_probs.max().item()
    top_class = class_logits.argmax(-1).item()

    return top_probability * 100, top_class

def compare_data(extracted_text, user_name, user_course, user_date, user_issuer):
    """Check the user-supplied details against the OCR text.

    Comparison is case-insensitive and ignores differences in whitespace
    (OCR often splits a name or date across lines). Blank or ``None``
    fields are skipped: they add no issue and cost no points.

    Args:
        extracted_text: Text recognised on the certificate.
        user_name: Expected holder name; must appear as a whole phrase.
        user_course: Expected course; matches when any word longer than
            three characters appears, or the whole phrase when it has no
            such word.
        user_date: Expected date; ``-`` and ``/`` separators are treated
            as interchangeable.
        user_issuer: Expected issuer; matched like ``user_course``.

    Returns:
        ``(matches, issues, score)`` where ``matches`` maps ``'name'``,
        ``'course'``, ``'date'`` and ``'issuer'`` to booleans, ``issues``
        is a list of ``(emoji, message, status)`` tuples, and ``score``
        starts at 100 and loses 25/20/20/15 points for a failed name/
        course/date/issuer check (never below 0).
    """
    matches = {
        'name': False,
        'course': False,
        'date': False,
        'issuer': False
    }
    issues = []
    score = 100

    text_lower = _squash(extracted_text)

    # Keep a trimmed copy for messages and a normalised copy for matching.
    name = (user_name or "").strip()
    course = (user_course or "").strip()
    date = (user_date or "").strip()
    issuer = (user_issuer or "").strip()

    # Check Name
    if name:
        if _squash(name) in text_lower:
            matches['name'] = True
            issues.append(("✅", "Name match found", "good"))
        else:
            issues.append(("❌", f"Name '{name}' NOT found in certificate", "bad"))
            score -= 25

    # Check Course/Program
    if course:
        if _phrase_matches(_squash(course), text_lower):
            matches['course'] = True
            issues.append(("✅", "Course/Program match found", "good"))
        else:
            issues.append(("❌", f"Course '{course}' NOT found in certificate", "bad"))
            score -= 20

    # Check Date
    if date:
        date_key = _squash(date)
        # Every extracted date is a substring of the text, so searching the
        # text for the date (with either separator) covers those as well.
        candidates = {date_key, date_key.replace('-', '/'), date_key.replace('/', '-')}
        if any(candidate in text_lower for candidate in candidates):
            matches['date'] = True
            issues.append(("✅", f"Date '{date}' verified", "good"))
        else:
            # Only needed to show the user which dates were actually seen.
            extracted_dates = extract_dates(extracted_text or "")
            found_preview = ', '.join(extracted_dates[:3]) if extracted_dates else 'None'
            issues.append(("⚠️", f"Date '{date}' NOT found (Found: {found_preview})", "warning"))
            score -= 20

    # Check Issuer/Organization
    if issuer:
        if _phrase_matches(_squash(issuer), text_lower):
            matches['issuer'] = True
            issues.append(("✅", f"Issuer '{issuer}' verified", "good"))
        else:
            issues.append(("❌", f"Issuer '{issuer}' NOT found in certificate", "bad"))
            score -= 15

    return matches, issues, max(0, score)


def _squash(value):
    """Lower-case ``value`` and collapse each whitespace run to one space."""
    return " ".join((value or "").lower().split())


def _phrase_matches(phrase, haystack):
    """Return True when the already-normalised ``phrase`` occurs in ``haystack``.

    Any word longer than three characters counts as a hit. A phrase made
    only of short words (for example "MIT") has no such keyword, so it is
    matched as a whole, standalone phrase instead of always failing.
    """
    keywords = [word for word in phrase.split() if len(word) > 3]
    if keywords:
        return any(word in haystack for word in keywords)
    # Lookarounds rather than \b so phrases ending in punctuation still work
    # and "mit" does not match inside "smith".
    standalone = r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
    return re.search(standalone, haystack) is not None

def validate_certificate(image, user_name, user_course, user_date, user_issuer):
    """Run the full validation pipeline and build the outputs for the UI.

    Steps: OCR the image, score it with the ViT classifier, compare the
    OCR text with the expected details, blend the two scores (40% ViT
    confidence, 60% data match) and render a Markdown report.

    Args:
        image: A PIL image or an array accepted by ``Image.fromarray``;
            ``None`` when nothing was uploaded.
        user_name, user_course, user_date, user_issuer: Expected details;
            blank fields are not checked.

    Returns:
        ``(report, extracted_text, json_output, final_score)``: Markdown
        report, raw OCR text, a dict of technical details and the 0-100
        final score. When no image is supplied or OCR fails, the report is
        a short error message and the score is 0.
    """
    if image is None:
        return "❌ Please upload an image", "", {}, 0

    # Convert to PIL Image if needed
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    # Step 1: Extract text using OCR
    extracted_text = extract_text_from_image(image)

    # extract_text_from_image reports failures in-band as "OCR Error: ...".
    # Scoring that message as if it were certificate text would brand a
    # genuine certificate "likely fake" because of an infrastructure problem.
    if extracted_text.startswith("OCR Error:"):
        failure_report = (
            "❌ Text extraction failed, so the certificate could not be checked.\n\n"
            f"`{extracted_text}`"
        )
        return failure_report, extracted_text, {"error": extracted_text}, 0

    # Step 2: Use ViT model for image analysis
    # Uploads can be RGBA, palette or greyscale; the classifier is given a
    # 3-channel copy (presumably what the checkpoint expects), while OCR
    # keeps the untouched original.
    vit_input = image if image.mode == "RGB" else image.convert("RGB")
    vit_confidence, vit_class = analyze_with_vit(vit_input)

    # Step 3: Compare extracted data with user data
    matches, comparison_issues, comparison_score = compare_data(
        extracted_text, user_name, user_course, user_date, user_issuer
    )

    # Fields left blank were never checked and must not be shown as failures.
    provided = {
        'name': bool((user_name or "").strip()),
        'course': bool((user_course or "").strip()),
        'date': bool((user_date or "").strip()),
        'issuer': bool((user_issuer or "").strip()),
    }

    # Step 4: Calculate final score
    # Weight: 40% ViT confidence, 60% data matching
    final_score = int((vit_confidence * 0.4) + (comparison_score * 0.6))

    # Step 5: Generate verdict
    if not any(provided.values()):
        # With nothing to compare, the data score stays at 100 by default;
        # that must not be presented as "all checks passed".
        verdict = "⚠️ VERIFICATION NEEDED"
        verdict_color = "🟡"
        verdict_detail = (
            "No expected details were provided, so nothing could be verified. "
            "Fill in at least one field."
        )
    elif final_score >= 70 and comparison_score >= 70:
        verdict = "✅ CERTIFICATE VALID"
        verdict_color = "🟢"
        verdict_detail = "All verification checks passed. Certificate appears authentic."
    elif final_score >= 50:
        verdict = "⚠️ VERIFICATION NEEDED"
        verdict_color = "🟡"
        verdict_detail = "Some discrepancies found. Manual verification recommended."
    else:
        verdict = "❌ CERTIFICATE INVALID"
        verdict_color = "🔴"
        verdict_detail = "Multiple verification failures. Certificate likely fake or incorrect."

    # Create detailed report
    report = f"""
# {verdict_color} {verdict}

**Final Score:** {final_score}/100  
**ViT Model Confidence:** {vit_confidence:.1f}%  
**Data Match Score:** {comparison_score}/100

---

## 📊 Verification Results

### Data Comparison:
"""

    # Bullet items: consecutive plain lines would be merged into a single
    # paragraph by the Markdown renderer.
    if comparison_issues:
        report += "".join(
            f"\n- {emoji} {issue}" for emoji, issue, _status in comparison_issues
        )
    else:
        report += "\n- ➖ No expected details were provided"

    report += f"""

---

## 🔍 Extracted Certificate Text:
```
{extracted_text[:500]}{'...' if len(extracted_text) > 500 else ''}
```

---

## 🤖 AI Model Analysis:
- **Model:** Google Vision Transformer (ViT)
- **Architecture:** ViT-Base-Patch16-224
- **Image Quality Score:** {vit_confidence:.1f}%
- **Classification:** Class {vit_class}

---

## ⚖️ Final Verdict:
{verdict_detail}

### Match Summary:
- Name: {_match_status(provided['name'], matches['name'])}
- Course: {_match_status(provided['course'], matches['course'])}
- Date: {_match_status(provided['date'], matches['date'])}
- Issuer: {_match_status(provided['issuer'], matches['issuer'])}

---

*⚠️ Disclaimer: This is an automated verification system. For legal purposes, 
please verify with the issuing authority.*
"""

    # Create JSON output
    json_output = {
        "verdict": verdict,
        "final_score": final_score,
        "vit_confidence": round(vit_confidence, 2),
        "data_match_score": comparison_score,
        "matches": matches,
        "extracted_text_preview": extracted_text[:200]
    }

    return report, extracted_text, json_output, final_score


def _match_status(was_provided, matched):
    """Return the summary label for one field of the match summary."""
    if not was_provided:
        return "➖ Not provided"
    return "✅ Verified" if matched else "❌ Not Found"


# Create Gradio Interface
# Components are created in the order they appear, inside nested layout
# context managers, so the statement order below is the on-screen layout:
# an intro, a two-column row (inputs left, results right), then a footer.
with gr.Blocks(theme=gr.themes.Soft(), title="AI Certificate Validator") as demo:
    
    # Intro text shown at the top of the page.
    gr.Markdown("""
    # 🛡️ AI-Powered Certificate Validation System
    
    ### Powered by Google's Vision Transformer (ViT) + OCR
    
    Upload a certificate image and provide the expected details. The AI will:
    1. Extract text using OCR (Optical Character Recognition)
    2. Analyze image quality using ViT deep learning model
    3. Compare extracted data with your provided information
    4. Generate a comprehensive validation report
    """)
    
    with gr.Row():
        # Left column: the certificate image plus the four expected details.
        with gr.Column(scale=1):
            gr.Markdown("## 📤 Upload Certificate")
            # type="pil" hands validate_certificate a PIL image.
            image_input = gr.Image(
                label="Certificate Image",
                type="pil",
                sources=["upload", "clipboard", "webcam"]
            )
            
            gr.Markdown("## 📝 Expected Certificate Details")
            
            user_name = gr.Textbox(
                label="Full Name (as on certificate)",
                placeholder="e.g., John Smith",
                lines=1
            )
            
            user_course = gr.Textbox(
                label="Course/Program Name",
                placeholder="e.g., Machine Learning Certification",
                lines=1
            )
            
            user_date = gr.Textbox(
                label="Issue Date",
                placeholder="e.g., 2024-01-15 or Jan 15, 2024",
                lines=1
            )
            
            user_issuer = gr.Textbox(
                label="Issuing Organization",
                placeholder="e.g., Stanford University",
                lines=1
            )
            
            validate_btn = gr.Button("🔍 Validate Certificate", variant="primary", size="lg")
            
            gr.Markdown("""
            ### 💡 Tips:
            - Ensure certificate image is clear and readable
            - Provide exact details as they appear on certificate
            - Date format: YYYY-MM-DD or Month DD, YYYY
            """)
        
        # Right column: one output component per value returned by
        # validate_certificate.
        with gr.Column(scale=1):
            gr.Markdown("## 📋 Validation Report")
            
            report_output = gr.Markdown(label="Analysis Report")
            
            score_output = gr.Number(
                label="Final Validation Score",
                precision=0
            )
            
            with gr.Accordion("📄 Extracted Text (OCR)", open=False):
                extracted_text_output = gr.Textbox(
                    label="Raw Extracted Text",
                    lines=10,
                    max_lines=20
                )
            
            with gr.Accordion("🔧 Technical Details (JSON)", open=False):
                # Module-level component; unrelated to the local variable of
                # the same name inside validate_certificate.
                json_output = gr.JSON(label="Detailed Results")
    
    # Connect button to function
    # `inputs` follows validate_certificate's parameter order and `outputs`
    # follows its return order (report, extracted text, JSON, score); keep
    # both lists in step with the function when either changes.
    validate_btn.click(
        fn=validate_certificate,
        inputs=[image_input, user_name, user_course, user_date, user_issuer],
        outputs=[report_output, extracted_text_output, json_output, score_output]
    )
    
    # Static footer describing the pipeline and the stack.
    gr.Markdown("""
    ---
    
    ## 🎯 How It Works:
    
    1. **Image Upload**: Certificate image is uploaded
    2. **OCR Processing**: Tesseract extracts all text from image
    3. **ViT Analysis**: Google's Vision Transformer analyzes image quality
    4. **Data Matching**: Compares extracted text with user-provided details
    5. **Scoring**: Combines AI confidence + data match accuracy
    6. **Verdict**: Generates final validation report
    
    ## 🔧 Technology Stack:
    - **AI Model**: Google Vision Transformer (ViT-Base-Patch16-224)
    - **OCR Engine**: Tesseract OCR
    - **Framework**: Hugging Face Transformers + Gradio
    - **Deployment**: Hugging Face Spaces
    
    ## 📊 Use Cases:
    - Academic certificate verification
    - Professional credential validation
    - Employment background checks
    - Document fraud detection
    
    ---
    
    **🚀 Created for Hackathon Demo**  
    *For production use, integrate with official verification APIs*
    """)

def main():
    """Start the Gradio server for the validator UI."""
    # share=True additionally requests a public share link from Gradio.
    demo.launch(share=True)


# Launch only when run as a script, never on import.
if __name__ == "__main__":
    main()