# Spaces: Sleeping  (hosting-page status banner captured with the source; not part of the program)
| from flask import Flask, request, render_template_string, redirect, url_for, jsonify | |
| import spacy | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import base64 | |
| from io import BytesIO | |
| import json | |
| from collections import Counter | |
| import time | |
# Flask application object; it is started by the __main__ guard at the bottom of the file.
app = Flask(__name__)
# Load spaCy models
# This runs at import time, so the pipeline is loaded once per process and then
# reused by process_text() through the module-level name `nlp`.
print("Loading NLP models...")
nlp = spacy.load("en_core_web_sm")
# HTML template with fixed samples and improved UI
#
# Single Jinja template used for both pages:
#   * results is falsy -> the input form is rendered; `text` pre-fills the textarea.
#   * results is truthy -> the results page is rendered from `report` (word_count,
#     sentence_count, reading_time, total_entities, unique_entities, entity_density,
#     entity_types, entity_details and, optionally, sentiment_score), `highlighted`
#     (inserted with |safe, so it must already be safe HTML) and `plot_img`
#     (an image URL; the chart card is skipped when it is falsy).
#
# The quotation inside the `newsSample` JS string uses single quotes on purpose:
# a bare double quote there terminates the JS string literal, which is a syntax
# error that stops the whole inline script (spinner, table search and sorting).
#
# NOTE(review): the markup below renders no elements with the ids sample-news,
# sample-business, sample-email or sample-dm, so the sample-button handlers in
# the script are currently inert.
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>NER Analyzer</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet">
<style>
body {
background-color: #f8f9fa;
padding: 20px;
}
.card {
margin-bottom: 20px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.entity {
display: inline-block;
border-radius: 3px;
padding: 0px 5px;
margin: 0 2px;
}
.person { background-color: #ffcccc; }
.org { background-color: #ccffcc; }
.gpe { background-color: #ccccff; }
.date { background-color: #ffffcc; }
.norp { background-color: #ffccff; }
.money { background-color: #cceeff; }
.product { background-color: #ffddcc; }
.event { background-color: #ddffcc; }
.work_of_art { background-color: #ccffff; }
.law { background-color: #ffccee; }
.language { background-color: #eeccff; }
.percent { background-color: #cceeff; }
.facility { background-color: #ffeecc; }
.stat-card {
text-align: center;
padding: 15px;
}
.stat-value {
font-size: 24px;
font-weight: bold;
}
#loading-spinner {
display: none;
text-align: center;
padding: 20px;
}
.entity-table {
width: 100%;
border-collapse: collapse;
}
.entity-table th {
background-color: #f1f5f9;
cursor: pointer;
position: relative;
user-select: none;
padding: 12px;
text-align: left;
border: 1px solid #dee2e6;
}
.entity-table th.sortable:hover {
background-color: #e2e8f0;
}
.entity-table th.sortable:after {
content: "⇕";
position: absolute;
right: 8px;
color: #64748b;
font-size: 12px;
}
.entity-table th.sort-asc:after {
content: "↑";
}
.entity-table th.sort-desc:after {
content: "↓";
}
.entity-table td {
padding: 12px;
text-align: left;
border: 1px solid #dee2e6;
vertical-align: middle;
}
.entity-table tr:nth-child(even) {
background-color: #f8f9fa;
}
.entity-table tr:hover {
background-color: #e9ecef;
}
.badge {
font-size: 0.8rem;
padding: 0.35em 0.65em;
border-radius: 4px;
}
#entitySearch {
width: 200px;
}
.sample-btn {
transition: all 0.2s ease;
}
.sample-btn:hover {
transform: translateY(-2px);
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
.top-entities {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-bottom: 12px;
}
.top-entity-badge {
padding: 6px 12px;
border-radius: 20px;
font-size: 0.9rem;
font-weight: 500;
display: flex;
align-items: center;
gap: 6px;
}
.circle-count {
display: inline-flex;
align-items: center;
justify-content: center;
width: 22px;
height: 22px;
border-radius: 50%;
background-color: rgba(255,255,255,0.3);
font-size: 0.75rem;
}
</style>
</head>
<body>
<div class="container">
<h1 class="mb-4">Smart Entity Extraction</h1>
{% if not results %}
<!-- Input Form -->
<div class="card p-4">
<h2 class="mb-3">Automated NER Detection</h2>
<p>Extract relevant entities such as persons, locations, and organizations from your written content.</p>
<form method="POST" action="/analyze" id="analyze-form">
<div class="mb-3">
<textarea name="text" id="text-input" class="form-control" rows="6"
placeholder="Enter your text here...">{{ text }}</textarea>
</div>
<div class="mb-3">
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" id="enable-sentiment" name="enable_sentiment">
<label class="form-check-label" for="enable-sentiment">Enable Sentiment Analysis</label>
</div>
</div>
<div class="d-flex justify-content-between">
<div>
<p>You can find sample text to analyze: <a href="https://docs.google.com/document/d/1cGjZtA_fs3mzEJ71IkpCdwbES9-x4E9w-_shICfJh3Q/edit?usp=sharing">Google Docs</a></p>
</div>
<button type="submit" class="btn btn-primary" id="analyze-btn">Analyze Text</button>
</div>
</form>
<div id="loading-spinner" class="mt-4">
<div class="spinner-border text-primary" role="status">
<span class="visually-hidden">Loading...</span>
</div>
<p class="mt-2">Analyzing text...</p>
</div>
</div>
{% else %}
<!-- Results -->
<div class="mb-3">
<a href="/" class="btn btn-outline-primary">← Back</a>
</div>
<!-- Stats -->
<div class="card p-4">
<h2 class="mb-3">Analysis Results</h2>
<p><strong>Text Stats:</strong> {{ report.word_count }} words, {{ report.sentence_count }} sentences, {{ report.reading_time }} min read</p>
<div class="row mb-4">
<div class="col-md-3">
<div class="card stat-card">
<div class="stat-value">{{ report.total_entities }}</div>
<div>Total Entities</div>
</div>
</div>
<div class="col-md-3">
<div class="card stat-card">
<div class="stat-value">{{ report.unique_entities }}</div>
<div>Unique Entities</div>
</div>
</div>
<div class="col-md-3">
<div class="card stat-card">
<div class="stat-value">{{ "%.1f"|format(report.entity_density*100) }}%</div>
<div>Entity Density</div>
</div>
</div>
<div class="col-md-3">
<div class="card stat-card">
<div class="stat-value">{{ report.entity_types }}</div>
<div>Entity Types</div>
</div>
</div>
</div>
<!-- Top 5 entities -->
<div class="mb-4">
<h4 class="mb-3">Top Entities</h4>
<div class="top-entities">
{% for (entity_text, entity_type), count in report.entity_details[:5] %}
<div class="top-entity-badge {{ entity_type.lower() }}">
{{ entity_text }}
<span class="circle-count">{{ count }}</span>
</div>
{% endfor %}
</div>
</div>
{% if 'sentiment_score' in report %}
<div class="row mb-2">
<div class="col-12">
<div class="card p-3">
<h4>Sentiment Analysis</h4>
<div class="progress">
<div class="progress-bar {{ 'bg-success' if report.sentiment_score > 0 else 'bg-danger' }}"
role="progressbar"
style="width: {{ (report.sentiment_score + 1) * 50 }}%;"
aria-valuenow="{{ report.sentiment_score }}"
aria-valuemin="-1"
aria-valuemax="1">
{{ "%.2f"|format(report.sentiment_score) }}
</div>
</div>
<p class="mt-2 mb-0">
<small>
<strong>Sentiment:</strong>
{{ "Positive" if report.sentiment_score > 0.1 else "Negative" if report.sentiment_score < -0.1 else "Neutral" }}
</small>
</p>
</div>
</div>
</div>
{% endif %}
</div>
<!-- Highlighted Text -->
<div class="card p-4">
<h3 class="mb-3">Highlighted Entities</h3>
<div class="p-3 bg-light rounded" id="highlighted-text">{{ highlighted|safe }}</div>
</div>
<!-- Entity Chart -->
{% if plot_img %}
<div class="card p-4">
<h3 class="mb-3">Entity Distribution</h3>
<div class="text-center">
<img src="{{ plot_img }}" alt="Entity Distribution" class="img-fluid rounded">
</div>
</div>
{% endif %}
<!-- Entities Table -->
<div class="card p-4">
<h3 class="mb-3">Entities Table</h3>
<div class="d-flex justify-content-between mb-3">
<div class="form-inline">
<input type="text" id="entitySearch" class="form-control form-control-sm" placeholder="Search entities...">
</div>
<div>
<select id="entitySort" class="form-select form-select-sm">
<option value="alpha">Sort Alphabetically</option>
<option value="count" selected>Sort by Count (High to Low)</option>
<option value="type">Sort by Entity Type</option>
<option value="relevance">Sort by Relevance</option>
</select>
</div>
</div>
<div class="table-responsive">
<table class="entity-table table table-hover" id="entityTable">
<thead>
<tr>
<th class="sortable" data-sort="entity">Entity</th>
<th class="sortable" data-sort="type">Type</th>
<th class="sortable" data-sort="count">Count</th>
<th class="sortable" data-sort="relevance">Relevance Score</th>
</tr>
</thead>
<tbody>
{% for (entity_text, entity_type), count in report.entity_details %}
{% set relevance = (count / report.word_count * 100)|round(2) %}
<tr>
<td>{{ entity_text }}</td>
<td>
<span class="badge {{ entity_type.lower() }} text-dark">
{{ entity_type }}
</span>
</td>
<td>{{ count }}</td>
<td>{{ relevance }}%</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<div class="mt-3 text-end">
<small class="text-muted">Relevance score shows entity frequency as percentage of total words.</small>
</div>
</div>
{% endif %}
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {
// Sample text handling
const newsSample = "On March 5, 2024, Elena Rodriguez, a data scientist at OpenAI, announced a new breakthrough in artificial intelligence during a conference in San Francisco, California. The research, conducted in collaboration with MIT and funded by Google DeepMind, aims to improve language models' efficiency by 40% while reducing energy consumption by 30%. During the event, Dr. Michael Chen, CTO of NeuralTech Corp, highlighted the potential impact of this technology on industries ranging from healthcare to finance. 'By 2025, we expect AI-driven solutions to contribute over $500 billion to the global economy,' he stated. Meanwhile, in New York City, Mayor Lisa Patel announced a partnership with Tesla to deploy 1,000 autonomous electric buses across the city by December 2025. The project, estimated to cost $1.2 billion, is expected to reduce carbon emissions by 20% within the first year. At the same time, cybersecurity experts at IBM Research Lab discovered a new vulnerability in Windows 11, which could potentially affect over 10 million devices worldwide. The team, led by Rajesh Kapoor, advised users to update their systems immediately.";
const businessSample = "On July 15, 2024, Johnson & Co., a leading investment firm based in Chicago, Illinois, announced a $750 million acquisition of BrightTech Solutions, a cybersecurity startup headquartered in Austin, Texas. The deal, negotiated by CEO Mark Stevenson and CFO Rachel Adams, aims to expand Johnson & Co.'s presence in the cybersecurity sector amid growing threats to corporate data. According to Bloomberg, the global cybersecurity market is projected to reach $310 billion by 2026, growing at an annual rate of 12.5%. Investors, including Vanguard Group and Sequoia Capital, have expressed confidence in the merger, with BrightTech's stock (BTSX) surging 18% following the announcement. Meanwhile, Amazon Web Services (AWS) unveiled a new cloud security framework during a technology summit in Seattle, Washington, aimed at helping Fortune 500 companies prevent data breaches. David Lin, AWS's Senior Vice President of Security, stated that the initiative could save businesses up to $50 million annually in cybersecurity-related losses. Elsewhere, Tesla Inc. reported record quarterly earnings of $18.6 billion, driven by strong demand for its Model Y and the expansion of its Gigafactory in Berlin, Germany. CEO Elon Musk attributed the success to advancements in battery technology and increased production efficiency.";
const emailSample = "Subject: Strategic Acquisition & Market Insights – Key Updates Dear Team, I hope you're all having a productive week. I wanted to take a moment to update you on some significant industry developments that could shape our strategic direction in the coming months. On July 15, 2024, Johnson & Co., a major player in the investment space, announced its $750 million acquisition of BrightTech Solutions, a fast-growing cybersecurity firm based in Austin, Texas. This move, spearheaded by CEO Mark Stevenson and CFO Rachel Adams, strengthens their position in the cybersecurity market, an industry expected to hit $310 billion by 2026, according to Bloomberg. Interestingly, BrightTech's stock (BTSX) saw an 18% surge following the announcement, signaling strong investor confidence. Meanwhile, Amazon Web Services (AWS) unveiled a new cloud security framework during their summit in Seattle, Washington, aimed at helping Fortune 500 companies reduce cybersecurity risks. David Lin, AWS's Senior VP of Security, mentioned that this initiative could save enterprises up to $50 million annually in breach-related costs. On the automotive front, Tesla Inc. reported a record-breaking $18.6 billion in quarterly revenue, largely driven by strong Model Y sales and the expansion of their Berlin, Germany Gigafactory. Elon Musk highlighted recent advancements in battery technology and improved production efficiency as key factors behind their success. Given these shifts, I'd love to schedule some time next week to discuss how these trends might influence our strategic priorities, particularly in cybersecurity investments and emerging tech partnerships. Let me know your availability, and I'll coordinate accordingly. Looking forward to our discussion. Best,";
const personalSample = "Hope you're doing well! I was just reading about Astra Dynamics securing a $1.2 billion investment from Summit Ventures to scale their AI-driven logistics platform. It looks like they're aiming to disrupt supply chain inefficiencies in a big way—might be something worth keeping an eye on. Also, saw that NovaTech Industries is expanding into Berlin and Singapore, with plans to open new R&D centers by early 2025. Given how fast deep-tech innovation is moving, I feel like we're going to see some big shifts in automation and smart manufacturing soon. Anyway, just thought you'd find this interesting! Let's catch up soon—I'd love to hear what you're working on these days. Maybe a coffee next week? Let me know what works for you! Take care,";
// Get the buttons and text input elements
const newsBtn = document.getElementById('sample-news');
const businessBtn = document.getElementById('sample-business');
const emailBtn = document.getElementById('sample-email');
const dmBtn = document.getElementById('sample-dm');
const textInput = document.getElementById('text-input');
const analyzeForm = document.getElementById('analyze-form');
const analyzeBtn = document.getElementById('analyze-btn');
const loadingSpinner = document.getElementById('loading-spinner');
// Event listeners for sample buttons - fixed implementation
if (newsBtn && textInput) {
newsBtn.onclick = function() {
textInput.value = newsSample;
};
}
if (businessBtn && textInput) {
businessBtn.onclick = function() {
textInput.value = businessSample;
};
}
if (emailBtn && textInput) {
emailBtn.onclick = function() {
textInput.value = emailSample;
};
}
if (dmBtn && textInput) {
dmBtn.onclick = function() {
textInput.value = personalSample;
};
}
// Loading spinner
if (analyzeForm && loadingSpinner && analyzeBtn) {
analyzeForm.addEventListener('submit', function() {
if (textInput && textInput.value.trim()) {
loadingSpinner.style.display = 'block';
analyzeBtn.disabled = true;
}
});
}
// Entity table search and sort functionality
const entitySearch = document.getElementById('entitySearch');
const entitySort = document.getElementById('entitySort');
const entityTable = document.getElementById('entityTable');
if (entitySearch) {
entitySearch.addEventListener('input', function() {
const searchTerm = this.value.toLowerCase();
const rows = entityTable.querySelectorAll('tbody tr');
rows.forEach(row => {
const entityName = row.cells[0].textContent.toLowerCase();
const entityType = row.cells[1].textContent.toLowerCase();
if (entityName.includes(searchTerm) || entityType.includes(searchTerm)) {
row.style.display = '';
} else {
row.style.display = 'none';
}
});
});
}
if (entitySort) {
entitySort.addEventListener('change', function() {
sortEntityTable(this.value);
});
}
// Add click event listeners to table headers for sorting
if (entityTable) {
const headers = entityTable.querySelectorAll('th.sortable');
headers.forEach(header => {
header.addEventListener('click', function() {
const sortKey = this.getAttribute('data-sort');
const currentSortOrder = this.classList.contains('sort-asc') ? 'desc' : 'asc';
// Remove sort classes from all headers
headers.forEach(h => {
h.classList.remove('sort-asc', 'sort-desc');
});
// Add sort class to clicked header
this.classList.add(`sort-${currentSortOrder}`);
// Sort table
let sortType;
switch(sortKey) {
case 'entity': sortType = 'alpha'; break;
case 'type': sortType = 'type'; break;
case 'count': sortType = 'count'; break;
case 'relevance': sortType = 'relevance'; break;
default: sortType = 'count';
}
sortEntityTable(sortType, currentSortOrder);
});
});
}
function sortEntityTable(sortBy, order = 'desc') {
if (!entityTable) return;
const tableBody = entityTable.querySelector('tbody');
if (!tableBody) return;
const rows = Array.from(tableBody.querySelectorAll('tr'));
rows.sort((a, b) => {
let aValue, bValue;
if (sortBy === 'alpha') {
aValue = a.cells[0].textContent.toLowerCase();
bValue = b.cells[0].textContent.toLowerCase();
return order === 'asc' ? aValue.localeCompare(bValue) : bValue.localeCompare(aValue);
} else if (sortBy === 'count') {
aValue = parseInt(a.cells[2].textContent);
bValue = parseInt(b.cells[2].textContent);
return order === 'asc' ? aValue - bValue : bValue - aValue;
} else if (sortBy === 'type') {
aValue = a.cells[1].textContent.toLowerCase();
bValue = b.cells[1].textContent.toLowerCase();
return order === 'asc' ? aValue.localeCompare(bValue) : bValue.localeCompare(aValue);
} else if (sortBy === 'relevance') {
aValue = parseFloat(a.cells[3].textContent);
bValue = parseFloat(b.cells[3].textContent);
return order === 'asc' ? aValue - bValue : bValue - aValue;
}
return 0;
});
// Remove existing rows
while (tableBody.firstChild) {
tableBody.removeChild(tableBody.firstChild);
}
// Add sorted rows
rows.forEach(row => {
tableBody.appendChild(row);
});
}
// Initialize sort if table exists
if (entityTable && document.querySelector('.entity-table')) {
sortEntityTable('count');
}
});
</script>
</body>
</html>
'''
def process_text(text):
    """Run the module-level spaCy pipeline over ``text``.

    The wall-clock duration of the pipeline call is printed to stdout.

    Args:
        text: Raw input string handed to ``nlp``.

    Returns:
        The document object produced by ``nlp(text)``.
    """
    began = time.time()
    parsed = nlp(text)
    elapsed = time.time() - began
    print(f"Processing time: {elapsed:.2f} seconds")
    return parsed
def get_entity_counts(doc):
    """Tally how many entities of each label ``doc.ents`` contains.

    Args:
        doc: Object exposing ``ents``, an iterable of entities that each
            carry a ``label_`` attribute.

    Returns:
        A plain ``dict`` mapping label -> number of occurrences, keyed in
        order of first appearance.
    """
    tally = Counter()
    for entity in doc.ents:
        tally[entity.label_] += 1
    return dict(tally)
def analyze_sentiment(doc):
    """Score sentiment with a small lexicon plus a short negation window.

    Non-stop, non-punctuation tokens whose lower-cased lemma appears in the
    positive or negative word list are counted. Then, for every negation
    token, the following (up to three) tokens are inspected: each sentiment
    word found there is removed from its own tally (never below zero) and
    credited to the opposite polarity.

    Args:
        doc: Indexable, sized sequence of tokens exposing ``lemma_``,
            ``lower_``, ``is_stop`` and ``is_punct``.

    Returns:
        ``(positive - negative) / (positive + negative)`` after the negation
        adjustment, i.e. a value in [-1, 1]; ``0`` when no sentiment word
        was found.
    """
    positive_words = {
        'good', 'great', 'excellent', 'positive', 'success', 'happy', 'improve', 'increase',
        'breakthrough', 'innovative', 'advanced', 'efficient', 'beneficial', 'advantage',
        'leading', 'opportunity', 'gain', 'profitable', 'growth', 'achievement',
    }
    negative_words = {
        'bad', 'poor', 'negative', 'failure', 'problem', 'issue', 'decrease', 'risk',
        'challenge', 'difficult', 'threat', 'loss', 'concern', 'weakness', 'crisis',
        'vulnerability', 'dangerous', 'decline', 'struggle', 'error',
    }
    negation_words = ('not', 'no', "n't", 'never', 'neither', 'nor')

    # Pass 1: plain lexicon hits among content tokens.
    positive_count = 0
    negative_count = 0
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_.lower()
        if lemma in positive_words:
            positive_count += 1
        if lemma in negative_words:
            negative_count += 1

    # Pass 2: flip the polarity of sentiment words that closely follow a negation.
    negated_positive = 0
    negated_negative = 0
    doc_length = len(doc)
    for index, token in enumerate(doc):
        if token.lower_ not in negation_words or index + 1 >= doc_length:
            continue
        # The window covers the next three tokens, clipped at the end of the doc.
        for position in range(index + 1, min(index + 4, doc_length)):
            following = doc[position].lemma_.lower()
            if following in positive_words:
                negated_positive += 1
                positive_count = max(0, positive_count - 1)
            elif following in negative_words:
                negated_negative += 1
                negative_count = max(0, negative_count - 1)

    adjusted_positive = positive_count + negated_negative
    adjusted_negative = negative_count + negated_positive
    total = adjusted_positive + adjusted_negative
    if total == 0:
        return 0
    return (adjusted_positive - adjusted_negative) / total
def generate_entity_report(text, enable_sentiment=False):
    """Parse ``text`` and summarise its named entities and basic text stats.

    Args:
        text: Raw input string; it is parsed with ``process_text``.
        enable_sentiment: When truthy, ``analyze_sentiment`` is run and its
            result is stored under ``"sentiment_score"``.

    Returns:
        A ``(report, doc)`` tuple. ``report`` is a dict with the keys
        ``entity_counts``, ``total_entities``, ``unique_entities``,
        ``entity_density``, ``entity_details``, ``entity_types``,
        ``word_count``, ``sentence_count`` and ``reading_time`` (plus
        ``sentiment_score`` when requested); ``doc`` is the parsed document.
    """
    doc = process_text(text)

    # Per-label totals and per-(text, label) totals, most frequent first.
    counts_by_type = get_entity_counts(doc)
    occurrences = Counter((ent.text, ent.label_) for ent in doc.ents)
    ranked_occurrences = occurrences.most_common()

    # Density = share of all tokens that sit inside an entity span.
    n_tokens = len(doc)
    n_entity_tokens = sum(len(ent) for ent in doc.ents)
    if n_tokens > 0:
        density = n_entity_tokens / n_tokens
    else:
        density = 0

    # Text statistics; reading time assumes 200 words per minute, minimum 1.
    n_words = sum(1 for token in doc if not token.is_punct)
    n_sentences = sum(1 for _ in doc.sents)
    minutes = max(1, round(n_words / 200))

    report = {
        "entity_counts": counts_by_type,
        "total_entities": len(doc.ents),
        "unique_entities": len(occurrences),
        "entity_density": density,
        "entity_details": ranked_occurrences,
        "entity_types": len(counts_by_type),
        "word_count": n_words,
        "sentence_count": n_sentences,
        "reading_time": minutes,
    }
    if enable_sentiment:
        report["sentiment_score"] = analyze_sentiment(doc)
    return report, doc
def highlight_entities_html(doc):
    """Render ``doc.text`` as HTML with every entity wrapped in a styled span.

    The returned markup is inserted into the page with Jinja's ``|safe``
    filter, so all text that originates from the user is HTML-escaped here.
    Without that, input such as ``<script>`` would be injected verbatim.

    Args:
        doc: Object exposing ``text`` (the full string) and ``ents``, an
            iterable of entities with ``start_char``, ``end_char`` (character
            offsets into ``text``) and ``label_``.

    Returns:
        An HTML string in which each entity becomes
        ``<span class="entity <label-lowercased>">text <small>[LABEL]</small></span>``
        and newlines are turned into ``<br>``.
    """
    from html import escape  # local import: only this function needs it

    text = doc.text
    pieces = []
    cursor = 0
    # Walk left to right so the original offsets stay valid while escaping
    # changes the length of the output.
    for ent in sorted(doc.ents, key=lambda ent: ent.start_char):
        start, end = ent.start_char, ent.end_char
        label = escape(ent.label_)
        pieces.append(escape(text[cursor:start]))
        pieces.append(
            f'<span class="entity {label.lower()}">{escape(text[start:end])} '
            f'<small>[{label}]</small></span>'
        )
        cursor = end
    pieces.append(escape(text[cursor:]))

    # Replace newlines with <br> tags
    return ''.join(pieces).replace('\n', '<br>')
def generate_plot_base64(entity_counts):
    """Draw a bar chart of entity counts and return it as a PNG data URI.

    Args:
        entity_counts: Mapping of entity label -> count.

    Returns:
        A ``data:image/png;base64,...`` string, or ``None`` when
        ``entity_counts`` is empty or ``None``.
    """
    if not entity_counts:
        return None

    df = pd.DataFrame(list(entity_counts.items()), columns=['Entity Type', 'Count'])
    df = df.sort_values('Count', ascending=False)

    # Use a color palette
    colors = ['#4361ee', '#3a0ca3', '#7209b7', '#f72585', '#4cc9f0',
              '#ff9e00', '#38b000', '#9d4edd', '#fb5607', '#023e8a']

    # Keep a handle on the figure and close it in ``finally``: pyplot keeps
    # every open figure alive, so a failure while drawing or saving would
    # otherwise leak one figure per failed request.
    fig = plt.figure(figsize=(10, 5))
    try:
        bars = plt.bar(df['Entity Type'], df['Count'], color=colors[:len(df)])

        # Add count labels
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
                     str(int(height)), ha='center', fontweight='bold')

        plt.title('Distribution of Named Entities')
        plt.xlabel('Entity Type')
        plt.ylabel('Count')
        plt.xticks(rotation=30)
        plt.tight_layout()

        # Convert plot to base64 for embedding in HTML
        buffer = BytesIO()
        fig.savefig(buffer, format='png', dpi=100)
    finally:
        plt.close(fig)

    img_data = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/png;base64,{img_data}"
# Registered at "/" because the results page's Back link targets href="/" and
# analyze() redirects here through url_for('index'); without a route
# registration neither of those can resolve.
@app.route('/')
def index():
    """Serve the landing page: the input form with an empty textarea."""
    # Render the form template
    return render_template_string(HTML_TEMPLATE, results=False, text="")
# Registered for POST on "/analyze", the target of the form in HTML_TEMPLATE
# (method="POST" action="/analyze").
@app.route('/analyze', methods=['POST'])
def analyze():
    """Handle the form submission and render the results page.

    Reads ``text`` and the optional ``enable_sentiment`` checkbox from the
    posted form. Blank or missing text sends the user back to the form;
    otherwise the entity report, highlighted HTML and chart are rendered.
    """
    # .get() so a request without the field is treated like blank input
    # instead of failing on the missing key.
    text = request.form.get('text', '')
    enable_sentiment = 'enable_sentiment' in request.form
    if not text.strip():
        return redirect(url_for('index'))

    # Generate analysis
    report, doc = generate_entity_report(text, enable_sentiment)
    highlighted = highlight_entities_html(doc)
    plot_img = generate_plot_base64(report['entity_counts'])

    # Render the results template
    return render_template_string(
        HTML_TEMPLATE,
        results=True,
        report=report,
        highlighted=highlighted,
        plot_img=plot_img,
    )
# API endpoint for programmatic access
# NOTE(review): no URL for this handler is visible in the file; "/api/analyze"
# is an assumption — confirm the intended path before publishing it to clients.
@app.route('/api/analyze', methods=['POST'])
def api_analyze():
    """Analyse text supplied as JSON and return the report as JSON.

    Expects a JSON object with a string ``text`` and an optional
    ``enable_sentiment`` flag.

    Returns:
        A JSON response with ``report`` and ``entities`` (text, label and
        character offsets of every entity), or a ``400`` response with an
        ``error`` message when the body is not a JSON object, ``text`` is
        missing, or ``text`` is not a string.
    """
    # silent=True yields None for a missing or malformed JSON body, so every
    # bad request is answered by the JSON error below rather than an error
    # raised while parsing.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'text' not in data:
        return jsonify({'error': 'No text provided'}), 400

    text = data['text']
    if not isinstance(text, str):
        return jsonify({'error': 'Text must be a string'}), 400
    enable_sentiment = data.get('enable_sentiment', False)

    # Generate analysis
    report, doc = generate_entity_report(text, enable_sentiment)
    entities = [
        {'text': ent.text, 'label': ent.label_, 'start': ent.start_char, 'end': ent.end_char}
        for ent in doc.ents
    ]
    return jsonify({
        'report': report,
        'entities': entities,
    })
def calculate_privacy_risk(doc, entity_counts):
    """Estimate a privacy-risk score from the entity labels found in a text.

    Each label contributes ``count * weight`` (labels without an explicit
    weight use 0.2). The sum is scaled by a density factor — entities per
    token times 100, capped at 1.0 — then multiplied by 10 and capped at 100.

    Args:
        doc: Sized object; ``len(doc)`` is used as the text length.
        entity_counts: Mapping of entity label -> count.

    Returns:
        The risk score, a number no greater than 100.
    """
    # Per-label privacy sensitivity weights.
    sensitivity_weights = {
        'PERSON': 1.0,  # Highest sensitivity
        'ORG': 0.7,
        'GPE': 0.6,  # Locations
        'MONEY': 0.9,
        'DATE': 0.5,
        'CARDINAL': 0.4,
        'ORDINAL': 0.3,
        'PRODUCT': 0.6,
        'EMAIL': 1.0,  # Add custom entity types for these
        'PHONE': 1.0,
        'ADDRESS': 1.0,
        'SSN': 1.0,
    }
    default_weight = 0.2

    weighted_total = sum(
        count * sensitivity_weights.get(label, default_weight)
        for label, count in entity_counts.items()
    )

    # Shorter texts with many entities are higher risk; the factor saturates
    # at 1.0 once the entity-to-token ratio reaches 1%.
    token_total = max(1, len(doc))
    entity_total = sum(entity_counts.values())
    density_factor = min(1.0, (entity_total / token_total) * 100)

    # Scale to 0-100
    return min(100, (weighted_total * density_factor * 10))
if __name__ == '__main__':
    import os

    # Debug mode enables Werkzeug's interactive debugger, which lets anyone
    # who can reach the server execute arbitrary Python. Keep it off unless
    # it is explicitly requested with FLASK_DEBUG=1 for local development.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')