# NER-system / app.py
# koulsahil's picture
# Update app.py
# 3065d6f verified
from flask import Flask, request, render_template_string, redirect, url_for, jsonify
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import base64
from io import BytesIO
import json
from collections import Counter
import time
# Flask application object; the @app.route decorators in this file register
# their handlers on it.
app = Flask(__name__)
# Load the spaCy English pipeline once at import time so that every request
# reuses the same `nlp` object instead of reloading the model.
print("Loading NLP models...")
nlp = spacy.load("en_core_web_sm")
# Single-page Jinja2 template used for both views: it renders the input form
# when `results` is falsy and the analysis view (stats, highlighted text,
# chart, entity table) otherwise.
#
# NOTE: the inline <script> stores sample texts in double-quoted JavaScript
# strings. Any double quote inside such a sample must reach the browser as \"
# (written \\" here, because this is a regular Python string literal);
# an unescaped quote is a JavaScript syntax error that prevents the whole
# script (loading spinner, table search and sorting) from running.
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>NER Analyzer</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet">
<style>
body {
background-color: #f8f9fa;
padding: 20px;
}
.card {
margin-bottom: 20px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.entity {
display: inline-block;
border-radius: 3px;
padding: 0px 5px;
margin: 0 2px;
}
.person { background-color: #ffcccc; }
.org { background-color: #ccffcc; }
.gpe { background-color: #ccccff; }
.date { background-color: #ffffcc; }
.norp { background-color: #ffccff; }
.money { background-color: #cceeff; }
.product { background-color: #ffddcc; }
.event { background-color: #ddffcc; }
.work_of_art { background-color: #ccffff; }
.law { background-color: #ffccee; }
.language { background-color: #eeccff; }
.percent { background-color: #cceeff; }
.facility { background-color: #ffeecc; }
.stat-card {
text-align: center;
padding: 15px;
}
.stat-value {
font-size: 24px;
font-weight: bold;
}
#loading-spinner {
display: none;
text-align: center;
padding: 20px;
}
.entity-table {
width: 100%;
border-collapse: collapse;
}
.entity-table th {
background-color: #f1f5f9;
cursor: pointer;
position: relative;
user-select: none;
padding: 12px;
text-align: left;
border: 1px solid #dee2e6;
}
.entity-table th.sortable:hover {
background-color: #e2e8f0;
}
.entity-table th.sortable:after {
content: "⇕";
position: absolute;
right: 8px;
color: #64748b;
font-size: 12px;
}
.entity-table th.sort-asc:after {
content: "↑";
}
.entity-table th.sort-desc:after {
content: "↓";
}
.entity-table td {
padding: 12px;
text-align: left;
border: 1px solid #dee2e6;
vertical-align: middle;
}
.entity-table tr:nth-child(even) {
background-color: #f8f9fa;
}
.entity-table tr:hover {
background-color: #e9ecef;
}
.badge {
font-size: 0.8rem;
padding: 0.35em 0.65em;
border-radius: 4px;
}
#entitySearch {
width: 200px;
}
.sample-btn {
transition: all 0.2s ease;
}
.sample-btn:hover {
transform: translateY(-2px);
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
.top-entities {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-bottom: 12px;
}
.top-entity-badge {
padding: 6px 12px;
border-radius: 20px;
font-size: 0.9rem;
font-weight: 500;
display: flex;
align-items: center;
gap: 6px;
}
.circle-count {
display: inline-flex;
align-items: center;
justify-content: center;
width: 22px;
height: 22px;
border-radius: 50%;
background-color: rgba(255,255,255,0.3);
font-size: 0.75rem;
}
</style>
</head>
<body>
<div class="container">
<h1 class="mb-4">Smart Entity Extraction</h1>
{% if not results %}
<!-- Input Form -->
<div class="card p-4">
<h2 class="mb-3">Automated NER Detection</h2>
<p>Extract relevant entities such as persons, locations, and organizations from your written content.</p>
<form method="POST" action="/analyze" id="analyze-form">
<div class="mb-3">
<textarea name="text" id="text-input" class="form-control" rows="6"
placeholder="Enter your text here...">{{ text }}</textarea>
</div>
<div class="mb-3">
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" id="enable-sentiment" name="enable_sentiment">
<label class="form-check-label" for="enable-sentiment">Enable Sentiment Analysis</label>
</div>
</div>
<div class="d-flex justify-content-between">
<div>
<p>You can find sample text to analyze: <a href="https://docs.google.com/document/d/1cGjZtA_fs3mzEJ71IkpCdwbES9-x4E9w-_shICfJh3Q/edit?usp=sharing">Google Docs</a></p>
</div>
<button type="submit" class="btn btn-primary" id="analyze-btn">Analyze Text</button>
</div>
</form>
<div id="loading-spinner" class="mt-4">
<div class="spinner-border text-primary" role="status">
<span class="visually-hidden">Loading...</span>
</div>
<p class="mt-2">Analyzing text...</p>
</div>
</div>
{% else %}
<!-- Results -->
<div class="mb-3">
<a href="/" class="btn btn-outline-primary">← Back</a>
</div>
<!-- Stats -->
<div class="card p-4">
<h2 class="mb-3">Analysis Results</h2>
<p><strong>Text Stats:</strong> {{ report.word_count }} words, {{ report.sentence_count }} sentences, {{ report.reading_time }} min read</p>
<div class="row mb-4">
<div class="col-md-3">
<div class="card stat-card">
<div class="stat-value">{{ report.total_entities }}</div>
<div>Total Entities</div>
</div>
</div>
<div class="col-md-3">
<div class="card stat-card">
<div class="stat-value">{{ report.unique_entities }}</div>
<div>Unique Entities</div>
</div>
</div>
<div class="col-md-3">
<div class="card stat-card">
<div class="stat-value">{{ "%.1f"|format(report.entity_density*100) }}%</div>
<div>Entity Density</div>
</div>
</div>
<div class="col-md-3">
<div class="card stat-card">
<div class="stat-value">{{ report.entity_types }}</div>
<div>Entity Types</div>
</div>
</div>
</div>
<!-- Top 5 entities -->
<div class="mb-4">
<h4 class="mb-3">Top Entities</h4>
<div class="top-entities">
{% for (entity_text, entity_type), count in report.entity_details[:5] %}
<div class="top-entity-badge {{ entity_type.lower() }}">
{{ entity_text }}
<span class="circle-count">{{ count }}</span>
</div>
{% endfor %}
</div>
</div>
{% if 'sentiment_score' in report %}
<div class="row mb-2">
<div class="col-12">
<div class="card p-3">
<h4>Sentiment Analysis</h4>
<div class="progress">
<div class="progress-bar {{ 'bg-success' if report.sentiment_score > 0 else 'bg-danger' }}"
role="progressbar"
style="width: {{ (report.sentiment_score + 1) * 50 }}%;"
aria-valuenow="{{ report.sentiment_score }}"
aria-valuemin="-1"
aria-valuemax="1">
{{ "%.2f"|format(report.sentiment_score) }}
</div>
</div>
<p class="mt-2 mb-0">
<small>
<strong>Sentiment:</strong>
{{ "Positive" if report.sentiment_score > 0.1 else "Negative" if report.sentiment_score < -0.1 else "Neutral" }}
</small>
</p>
</div>
</div>
</div>
{% endif %}
</div>
<!-- Highlighted Text -->
<div class="card p-4">
<h3 class="mb-3">Highlighted Entities</h3>
<div class="p-3 bg-light rounded" id="highlighted-text">{{ highlighted|safe }}</div>
</div>
<!-- Entity Chart -->
{% if plot_img %}
<div class="card p-4">
<h3 class="mb-3">Entity Distribution</h3>
<div class="text-center">
<img src="{{ plot_img }}" alt="Entity Distribution" class="img-fluid rounded">
</div>
</div>
{% endif %}
<!-- Entities Table -->
<div class="card p-4">
<h3 class="mb-3">Entities Table</h3>
<div class="d-flex justify-content-between mb-3">
<div class="form-inline">
<input type="text" id="entitySearch" class="form-control form-control-sm" placeholder="Search entities...">
</div>
<div>
<select id="entitySort" class="form-select form-select-sm">
<option value="alpha">Sort Alphabetically</option>
<option value="count" selected>Sort by Count (High to Low)</option>
<option value="type">Sort by Entity Type</option>
<option value="relevance">Sort by Relevance</option>
</select>
</div>
</div>
<div class="table-responsive">
<table class="entity-table table table-hover" id="entityTable">
<thead>
<tr>
<th class="sortable" data-sort="entity">Entity</th>
<th class="sortable" data-sort="type">Type</th>
<th class="sortable" data-sort="count">Count</th>
<th class="sortable" data-sort="relevance">Relevance Score</th>
</tr>
</thead>
<tbody>
{% for (entity_text, entity_type), count in report.entity_details %}
{% set relevance = (count / report.word_count * 100)|round(2) %}
<tr>
<td>{{ entity_text }}</td>
<td>
<span class="badge {{ entity_type.lower() }} text-dark">
{{ entity_type }}
</span>
</td>
<td>{{ count }}</td>
<td>{{ relevance }}%</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<div class="mt-3 text-end">
<small class="text-muted">Relevance score shows entity frequency as percentage of total words.</small>
</div>
</div>
{% endif %}
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {
// Sample text handling
const newsSample = "On March 5, 2024, Elena Rodriguez, a data scientist at OpenAI, announced a new breakthrough in artificial intelligence during a conference in San Francisco, California. The research, conducted in collaboration with MIT and funded by Google DeepMind, aims to improve language models' efficiency by 40% while reducing energy consumption by 30%. During the event, Dr. Michael Chen, CTO of NeuralTech Corp, highlighted the potential impact of this technology on industries ranging from healthcare to finance. \\"By 2025, we expect AI-driven solutions to contribute over $500 billion to the global economy,\\" he stated. Meanwhile, in New York City, Mayor Lisa Patel announced a partnership with Tesla to deploy 1,000 autonomous electric buses across the city by December 2025. The project, estimated to cost $1.2 billion, is expected to reduce carbon emissions by 20% within the first year. At the same time, cybersecurity experts at IBM Research Lab discovered a new vulnerability in Windows 11, which could potentially affect over 10 million devices worldwide. The team, led by Rajesh Kapoor, advised users to update their systems immediately.";
const businessSample = "On July 15, 2024, Johnson & Co., a leading investment firm based in Chicago, Illinois, announced a $750 million acquisition of BrightTech Solutions, a cybersecurity startup headquartered in Austin, Texas. The deal, negotiated by CEO Mark Stevenson and CFO Rachel Adams, aims to expand Johnson & Co.'s presence in the cybersecurity sector amid growing threats to corporate data. According to Bloomberg, the global cybersecurity market is projected to reach $310 billion by 2026, growing at an annual rate of 12.5%. Investors, including Vanguard Group and Sequoia Capital, have expressed confidence in the merger, with BrightTech's stock (BTSX) surging 18% following the announcement. Meanwhile, Amazon Web Services (AWS) unveiled a new cloud security framework during a technology summit in Seattle, Washington, aimed at helping Fortune 500 companies prevent data breaches. David Lin, AWS's Senior Vice President of Security, stated that the initiative could save businesses up to $50 million annually in cybersecurity-related losses. Elsewhere, Tesla Inc. reported record quarterly earnings of $18.6 billion, driven by strong demand for its Model Y and the expansion of its Gigafactory in Berlin, Germany. CEO Elon Musk attributed the success to advancements in battery technology and increased production efficiency.";
const emailSample = "Subject: Strategic Acquisition & Market Insights – Key Updates Dear Team, I hope you're all having a productive week. I wanted to take a moment to update you on some significant industry developments that could shape our strategic direction in the coming months. On July 15, 2024, Johnson & Co., a major player in the investment space, announced its $750 million acquisition of BrightTech Solutions, a fast-growing cybersecurity firm based in Austin, Texas. This move, spearheaded by CEO Mark Stevenson and CFO Rachel Adams, strengthens their position in the cybersecurity market, an industry expected to hit $310 billion by 2026, according to Bloomberg. Interestingly, BrightTech's stock (BTSX) saw an 18% surge following the announcement, signaling strong investor confidence. Meanwhile, Amazon Web Services (AWS) unveiled a new cloud security framework during their summit in Seattle, Washington, aimed at helping Fortune 500 companies reduce cybersecurity risks. David Lin, AWS's Senior VP of Security, mentioned that this initiative could save enterprises up to $50 million annually in breach-related costs. On the automotive front, Tesla Inc. reported a record-breaking $18.6 billion in quarterly revenue, largely driven by strong Model Y sales and the expansion of their Berlin, Germany Gigafactory. Elon Musk highlighted recent advancements in battery technology and improved production efficiency as key factors behind their success. Given these shifts, I'd love to schedule some time next week to discuss how these trends might influence our strategic priorities, particularly in cybersecurity investments and emerging tech partnerships. Let me know your availability, and I'll coordinate accordingly. Looking forward to our discussion. Best,";
const personalSample = "Hope you're doing well! I was just reading about Astra Dynamics securing a $1.2 billion investment from Summit Ventures to scale their AI-driven logistics platform. It looks like they're aiming to disrupt supply chain inefficiencies in a big way—might be something worth keeping an eye on. Also, saw that NovaTech Industries is expanding into Berlin and Singapore, with plans to open new R&D centers by early 2025. Given how fast deep-tech innovation is moving, I feel like we're going to see some big shifts in automation and smart manufacturing soon. Anyway, just thought you'd find this interesting! Let's catch up soon—I'd love to hear what you're working on these days. Maybe a coffee next week? Let me know what works for you! Take care,";
// Get the buttons and text input elements
const newsBtn = document.getElementById('sample-news');
const businessBtn = document.getElementById('sample-business');
const emailBtn = document.getElementById('sample-email');
const dmBtn = document.getElementById('sample-dm');
const textInput = document.getElementById('text-input');
const analyzeForm = document.getElementById('analyze-form');
const analyzeBtn = document.getElementById('analyze-btn');
const loadingSpinner = document.getElementById('loading-spinner');
// Event listeners for sample buttons - fixed implementation
if (newsBtn && textInput) {
newsBtn.onclick = function() {
textInput.value = newsSample;
};
}
if (businessBtn && textInput) {
businessBtn.onclick = function() {
textInput.value = businessSample;
};
}
if (emailBtn && textInput) {
emailBtn.onclick = function() {
textInput.value = emailSample;
};
}
if (dmBtn && textInput) {
dmBtn.onclick = function() {
textInput.value = personalSample;
};
}
// Loading spinner
if (analyzeForm && loadingSpinner && analyzeBtn) {
analyzeForm.addEventListener('submit', function() {
if (textInput && textInput.value.trim()) {
loadingSpinner.style.display = 'block';
analyzeBtn.disabled = true;
}
});
}
// Entity table search and sort functionality
const entitySearch = document.getElementById('entitySearch');
const entitySort = document.getElementById('entitySort');
const entityTable = document.getElementById('entityTable');
if (entitySearch) {
entitySearch.addEventListener('input', function() {
const searchTerm = this.value.toLowerCase();
const rows = entityTable.querySelectorAll('tbody tr');
rows.forEach(row => {
const entityName = row.cells[0].textContent.toLowerCase();
const entityType = row.cells[1].textContent.toLowerCase();
if (entityName.includes(searchTerm) || entityType.includes(searchTerm)) {
row.style.display = '';
} else {
row.style.display = 'none';
}
});
});
}
if (entitySort) {
entitySort.addEventListener('change', function() {
sortEntityTable(this.value);
});
}
// Add click event listeners to table headers for sorting
if (entityTable) {
const headers = entityTable.querySelectorAll('th.sortable');
headers.forEach(header => {
header.addEventListener('click', function() {
const sortKey = this.getAttribute('data-sort');
const currentSortOrder = this.classList.contains('sort-asc') ? 'desc' : 'asc';
// Remove sort classes from all headers
headers.forEach(h => {
h.classList.remove('sort-asc', 'sort-desc');
});
// Add sort class to clicked header
this.classList.add(`sort-${currentSortOrder}`);
// Sort table
let sortType;
switch(sortKey) {
case 'entity': sortType = 'alpha'; break;
case 'type': sortType = 'type'; break;
case 'count': sortType = 'count'; break;
case 'relevance': sortType = 'relevance'; break;
default: sortType = 'count';
}
sortEntityTable(sortType, currentSortOrder);
});
});
}
function sortEntityTable(sortBy, order = 'desc') {
if (!entityTable) return;
const tableBody = entityTable.querySelector('tbody');
if (!tableBody) return;
const rows = Array.from(tableBody.querySelectorAll('tr'));
rows.sort((a, b) => {
let aValue, bValue;
if (sortBy === 'alpha') {
aValue = a.cells[0].textContent.toLowerCase();
bValue = b.cells[0].textContent.toLowerCase();
return order === 'asc' ? aValue.localeCompare(bValue) : bValue.localeCompare(aValue);
} else if (sortBy === 'count') {
aValue = parseInt(a.cells[2].textContent);
bValue = parseInt(b.cells[2].textContent);
return order === 'asc' ? aValue - bValue : bValue - aValue;
} else if (sortBy === 'type') {
aValue = a.cells[1].textContent.toLowerCase();
bValue = b.cells[1].textContent.toLowerCase();
return order === 'asc' ? aValue.localeCompare(bValue) : bValue.localeCompare(aValue);
} else if (sortBy === 'relevance') {
aValue = parseFloat(a.cells[3].textContent);
bValue = parseFloat(b.cells[3].textContent);
return order === 'asc' ? aValue - bValue : bValue - aValue;
}
return 0;
});
// Remove existing rows
while (tableBody.firstChild) {
tableBody.removeChild(tableBody.firstChild);
}
// Add sorted rows
rows.forEach(row => {
tableBody.appendChild(row);
});
}
// Initialize sort if table exists
if (entityTable && document.querySelector('.entity-table')) {
sortEntityTable('count');
}
});
</script>
</body>
</html>
'''
def process_text(text):
    """Run the module-level spaCy pipeline over *text* and return the Doc.

    The wall-clock duration of the pipeline call is printed to stdout.
    """
    began = time.time()
    parsed = nlp(text)
    elapsed = time.time() - began
    print(f"Processing time: {elapsed:.2f} seconds")
    return parsed
def get_entity_counts(doc):
    """Return a plain dict mapping each entity label in ``doc.ents`` to its count.

    Keys appear in the order in which each label is first encountered.
    """
    tally = Counter()
    for entity in doc.ents:
        tally[entity.label_] += 1
    return dict(tally)
def analyze_sentiment(doc):
    """Score *doc* with a small lexicon and a simple negation rule.

    Lemmas of tokens that are neither stop words nor punctuation are matched
    against fixed positive and negative word lists. Afterwards, every negation
    token flips each sentiment word found among the (up to) three tokens that
    follow it.

    Returns ``(positive - negative) / (positive + negative)`` computed on the
    adjusted counts, a value in [-1, 1], or ``0`` when no sentiment word was
    counted.
    """
    positive_words = {
        'good', 'great', 'excellent', 'positive', 'success', 'happy', 'improve', 'increase',
        'breakthrough', 'innovative', 'advanced', 'efficient', 'beneficial', 'advantage',
        'leading', 'opportunity', 'gain', 'profitable', 'growth', 'achievement',
    }
    negative_words = {
        'bad', 'poor', 'negative', 'failure', 'problem', 'issue', 'decrease', 'risk',
        'challenge', 'difficult', 'threat', 'loss', 'concern', 'weakness', 'crisis',
        'vulnerability', 'dangerous', 'decline', 'struggle', 'error',
    }
    negation_words = ('not', 'no', "n't", 'never', 'neither', 'nor')

    # Pass 1: raw lexicon hits among content tokens.
    positive_count = 0
    negative_count = 0
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_.lower()
        if lemma in positive_words:
            positive_count += 1
        if lemma in negative_words:
            negative_count += 1

    # Pass 2: each negation flips sentiment words in the window after it.
    doc_length = len(doc)
    flipped_to_negative = 0  # positive words that were negated
    flipped_to_positive = 0  # negative words that were negated
    for position, token in enumerate(doc):
        if token.lower_ not in negation_words or position + 1 >= doc_length:
            continue
        window_stop = min(4, doc_length - position)
        for offset in range(1, window_stop):
            following = doc[position + offset].lemma_.lower()
            if following in positive_words:
                flipped_to_negative += 1
                # Take the word out of the raw positive tally (never below 0).
                positive_count = max(0, positive_count - 1)
            elif following in negative_words:
                flipped_to_positive += 1
                # Take the word out of the raw negative tally (never below 0).
                negative_count = max(0, negative_count - 1)

    net_positive = positive_count + flipped_to_positive
    net_negative = negative_count + flipped_to_negative
    scored = net_positive + net_negative
    if scored == 0:
        return 0
    return (net_positive - net_negative) / scored
def generate_entity_report(text, enable_sentiment=False):
    """Analyse *text* and return ``(report, doc)``.

    ``report`` is a dict holding per-label entity counts, total and unique
    entity counts, entity density (entity tokens / all tokens), the
    ``((text, label), count)`` pairs ordered by frequency, and basic text
    statistics. A ``sentiment_score`` entry is added only when
    *enable_sentiment* is truthy. ``doc`` is the processed spaCy document.
    """
    doc = process_text(text)
    found = doc.ents

    # Per-label tallies and per-(surface text, label) tallies.
    label_counts = get_entity_counts(doc)
    occurrences = Counter((ent.text, ent.label_) for ent in found)

    # Density: share of tokens that belong to some entity span.
    token_total = len(doc)
    tokens_in_entities = sum(len(ent) for ent in found)
    density = tokens_in_entities / token_total if token_total > 0 else 0

    # Text statistics; reading time assumes 200 words per minute, minimum 1.
    words = sum(1 for token in doc if not token.is_punct)
    sentences = sum(1 for _ in doc.sents)
    minutes = max(1, round(words / 200))

    report = {
        "entity_counts": label_counts,
        "total_entities": len(found),
        "unique_entities": len(occurrences),
        "entity_density": density,
        "entity_details": occurrences.most_common(),
        "entity_types": len(label_counts),
        "word_count": words,
        "sentence_count": sentences,
        "reading_time": minutes,
    }
    if enable_sentiment:
        report["sentiment_score"] = analyze_sentiment(doc)
    return report, doc
def highlight_entities_html(doc):
    """Return an HTML fragment of ``doc.text`` with entity spans highlighted.

    Each entity becomes
    ``<span class="entity {label}">{text} <small>[{LABEL}]</small></span>``
    and newlines become ``<br>``.

    The result is rendered by the template with ``|safe``, so every piece of
    document text (inside and outside entities) is HTML-escaped here;
    otherwise user-submitted markup such as ``<script>`` would be injected
    into the page verbatim.
    """
    from html import escape  # local import keeps this function self-contained

    text = doc.text
    pieces = []
    cursor = 0  # index of the first character not yet emitted
    for ent in sorted(doc.ents, key=lambda e: e.start_char):
        start, end = ent.start_char, ent.end_char
        if start < cursor:
            # Span overlaps one already emitted; skip it rather than
            # duplicate or corrupt the surrounding text.
            continue
        pieces.append(escape(text[cursor:start]))
        pieces.append(
            f'<span class="entity {escape(ent.label_.lower())}">'
            f'{escape(text[start:end])} <small>[{escape(ent.label_)}]</small></span>'
        )
        cursor = end
    pieces.append(escape(text[cursor:]))
    # Escaping leaves newlines untouched, so convert them once at the end.
    return ''.join(pieces).replace('\n', '<br>')
def generate_plot_base64(entity_counts):
    """Render a bar chart of entity-type counts as a PNG data URI.

    *entity_counts* maps entity label -> count. Returns ``None`` when it is
    empty (or otherwise falsy); otherwise returns a
    ``data:image/png;base64,...`` string suitable for an ``<img src>``.

    The chart is drawn on an explicit Figure/Axes pair rather than pyplot's
    implicit "current figure", and the figure is closed in ``finally`` so it
    is released even if drawing or saving raises.
    """
    if not entity_counts:
        return None

    df = pd.DataFrame(list(entity_counts.items()), columns=['Entity Type', 'Count'])
    df = df.sort_values('Count', ascending=False)

    # Fixed palette for the bars.
    colors = ['#4361ee', '#3a0ca3', '#7209b7', '#f72585', '#4cc9f0',
              '#ff9e00', '#38b000', '#9d4edd', '#fb5607', '#023e8a']

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        bars = ax.bar(df['Entity Type'], df['Count'], color=colors[:len(df)])
        # Print each bar's count just above it.
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
                    str(int(height)), ha='center', fontweight='bold')
        ax.set_title('Distribution of Named Entities')
        ax.set_xlabel('Entity Type')
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', labelrotation=30)
        fig.tight_layout()

        # Serialize the figure to an in-memory PNG.
        buffer = BytesIO()
        fig.savefig(buffer, format='png', dpi=100)
    finally:
        plt.close(fig)

    img_data = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/png;base64,{img_data}"
@app.route('/')
def index():
    """Serve the landing page: the template in form mode with an empty text box."""
    form_context = {"results": False, "text": ""}
    return render_template_string(HTML_TEMPLATE, **form_context)
@app.route('/analyze', methods=['POST'])
def analyze():
    """Handle the HTML form submission and render the results view.

    Reads the ``text`` form field and the ``enable_sentiment`` checkbox.
    Blank (whitespace-only) text redirects back to the index page; otherwise
    the entity report, highlighted HTML and chart are generated and passed to
    the template in results mode.
    """
    form = request.form
    text = form['text']
    # An unchecked checkbox is simply absent from the submitted form.
    enable_sentiment = 'enable_sentiment' in form

    if not text.strip():
        return redirect(url_for('index'))

    report, doc = generate_entity_report(text, enable_sentiment)
    context = {
        "results": True,
        "report": report,
        "highlighted": highlight_entities_html(doc),
        "plot_img": generate_plot_base64(report['entity_counts']),
    }
    return render_template_string(HTML_TEMPLATE, **context)
# API endpoint for programmatic access
@app.route('/api/analyze', methods=['POST'])
def api_analyze():
    """Analyse text posted as JSON and return the report as JSON.

    Expected body: ``{"text": "<string>", "enable_sentiment": <bool, optional>}``.

    Responds with ``{"report": ..., "entities": [...]}`` where each entity has
    ``text``, ``label``, ``start`` and ``end`` (character offsets).
    Responds with HTTP 400 and ``{"error": ...}`` when the body is not a JSON
    object, has no ``text`` key, or ``text`` is not a string.
    """
    # silent=True yields None for a missing/invalid JSON body or a non-JSON
    # content type, so those cases get this endpoint's own JSON error
    # instead of a framework-generated error page.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'text' not in data:
        return jsonify({'error': 'No text provided'}), 400

    text = data['text']
    if not isinstance(text, str):
        # Passing a non-string on to the NLP pipeline would surface as a 500.
        return jsonify({'error': 'Field "text" must be a string'}), 400
    enable_sentiment = bool(data.get('enable_sentiment', False))

    # Generate analysis
    report, doc = generate_entity_report(text, enable_sentiment)
    entities = [
        {'text': ent.text, 'label': ent.label_, 'start': ent.start_char, 'end': ent.end_char}
        for ent in doc.ents
    ]
    return jsonify({
        'report': report,
        'entities': entities
    })
def calculate_privacy_risk(doc, entity_counts):
    """Return a privacy-risk score capped at 100.

    Each entity label's count is multiplied by a fixed sensitivity weight
    (0.2 for labels not in the table) and summed. The sum is then scaled by a
    density factor (entities per token of *doc*, times 100, capped at 1.0)
    and by 10, and the result is capped at 100.
    """
    # Per-label sensitivity; higher means more privacy-sensitive.
    sensitivity_weights = {
        'PERSON': 1.0,
        'ORG': 0.7,
        'GPE': 0.6,
        'MONEY': 0.9,
        'DATE': 0.5,
        'CARDINAL': 0.4,
        'ORDINAL': 0.3,
        'PRODUCT': 0.6,
        # Labels below are reserved for custom entity types.
        'EMAIL': 1.0,
        'PHONE': 1.0,
        'ADDRESS': 1.0,
        'SSN': 1.0,
    }

    weighted_total = sum(
        count * sensitivity_weights.get(label, 0.2)
        for label, count in entity_counts.items()
    )

    # Short texts packed with entities saturate the factor at 1.0.
    token_total = max(1, len(doc))
    entity_total = sum(entity_counts.values())
    density_factor = min(1.0, (entity_total / token_total) * 100)

    return min(100, (weighted_total * density_factor * 10))
if __name__ == '__main__':
    import os

    # Debug mode enables the interactive Werkzeug debugger, which allows
    # arbitrary code execution from the browser, so it is opt-in via the
    # FLASK_DEBUG environment variable instead of being always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled)