Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- README.md +4 -6
- app.py +783 -0
- requirements.txt +5 -0
README.md
CHANGED
|
@@ -1,12 +1,10 @@
|
|
| 1 |
---
|
| 2 |
title: Havelock Demo Substring
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 6.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
title: Havelock Demo Substring
|
| 3 |
+
emoji: 🔬
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 6.3.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,783 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# v1.7 - Substring-level marker highlighting
|
| 2 |
+
"""
|
| 3 |
+
Havelock.AI - Orality Analyzer (Substring Staging)
|
| 4 |
+
|
| 5 |
+
Adds token-level span predictions from the trained MultiLabelTokenClassifier
|
| 6 |
+
(HavelockAI/bert-token-classifier) alongside existing sentence-level models.
|
| 7 |
+
|
| 8 |
+
Only Tier 1 markers (F1 >= 0.50) produce substring spans.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import gradio as gr
|
| 12 |
+
import torch
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
from transformers import (
|
| 15 |
+
AutoModel,
|
| 16 |
+
AutoTokenizer,
|
| 17 |
+
BertTokenizerFast,
|
| 18 |
+
BertModel,
|
| 19 |
+
BertForSequenceClassification,
|
| 20 |
+
)
|
| 21 |
+
from huggingface_hub import hf_hub_download
|
| 22 |
+
import json
|
| 23 |
+
import re
|
| 24 |
+
import threading
|
| 25 |
+
import time
|
| 26 |
+
import random
|
| 27 |
+
|
| 28 |
+
# Tracking endpoint
|
| 29 |
+
TRACK_URL = "https://havelock.ai/api/track"
|
| 30 |
+
|
| 31 |
+
# Language detection
|
| 32 |
+
try:
|
| 33 |
+
from langdetect import detect, DetectorFactory
|
| 34 |
+
DetectorFactory.seed = 0
|
| 35 |
+
HAS_LANGDETECT = True
|
| 36 |
+
except ImportError:
|
| 37 |
+
HAS_LANGDETECT = False
|
| 38 |
+
print("Warning: langdetect not installed. Language detection disabled.")
|
| 39 |
+
|
| 40 |
+
# Model repositories
|
| 41 |
+
MODEL_REPO = "thestalwart/havelock-orality"
|
| 42 |
+
TOKEN_MODEL_REPO = "HavelockAI/bert-token-classifier"
|
| 43 |
+
|
| 44 |
+
# Device
|
| 45 |
+
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 46 |
+
|
| 47 |
+
# Ensemble weights
|
| 48 |
+
DOC_MODEL_WEIGHT = 0.35
|
| 49 |
+
SENTENCE_WEIGHT = 0.65
|
| 50 |
+
|
| 51 |
+
# Sentence analysis cap
|
| 52 |
+
MAX_SENTENCES = 100
|
| 53 |
+
MAX_INPUT_CHARS = 50000
|
| 54 |
+
MAX_WORDS_PER_SENTENCE = 150
|
| 55 |
+
|
| 56 |
+
# Tier 1 markers: F1 >= 0.50 from manifest
|
| 57 |
+
TIER1_MARKERS = {
|
| 58 |
+
"oral_vocative",
|
| 59 |
+
"literate_technical_abbreviation",
|
| 60 |
+
"oral_phatic_check",
|
| 61 |
+
"oral_imperative",
|
| 62 |
+
"oral_specific_place",
|
| 63 |
+
"literate_citation",
|
| 64 |
+
"literate_agentless_passive",
|
| 65 |
+
"oral_rhetorical_question",
|
| 66 |
+
"oral_inclusive_we",
|
| 67 |
+
"oral_second_person",
|
| 68 |
+
"oral_named_individual",
|
| 69 |
+
"literate_nominalization",
|
| 70 |
+
"literate_probability",
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class BertOralityRegressor(nn.Module):
    """BERT model with regression head for orality scoring.

    Wraps a pretrained BERT encoder with dropout and a single linear unit;
    a sigmoid squashes the output into [0, 1]. (Presumably 1 = oral and
    0 = literate — confirm against the training labels.)
    """

    def __init__(self, bert_model_name: str = 'bert-base-uncased', dropout: float = 0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout)
        # Single-output regression head over the pooled [CLS] representation.
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return a per-example orality score tensor of shape (batch,)."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.regressor(pooled_output)
        # squeeze(-1) drops the singleton output dimension -> (batch,)
        return self.sigmoid(logits).squeeze(-1)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def load_models():
    """Download and load all sentence-level models from the HuggingFace Hub.

    Returns:
        tuple: (tokenizer, doc_model, category_model, category_labels,
        subtype_model, subtype_labels). The label objects are loaded from the
        repo's JSON files; later code inverts them as {index: name}, so they
        map label name -> index.
    """
    print("Loading sentence-level models...")

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    # Document-level orality regressor: custom head, weights stored as a
    # state_dict .pt file in the repo.
    # NOTE(review): torch.load without weights_only=True unpickles arbitrary
    # objects from the downloaded file — consider weights_only=True.
    doc_model_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_orality_regressor.pt")
    doc_model = BertOralityRegressor().to(DEVICE)
    doc_model.load_state_dict(torch.load(doc_model_path, map_location=DEVICE))
    doc_model.eval()

    # Oral-vs-literate sentence classifier; label count comes from the JSON map.
    cat_model_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_marker_category.pt")
    cat_labels_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_marker_category_labels.json")
    with open(cat_labels_path) as f:
        category_labels = json.load(f)
    category_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(category_labels)
    ).to(DEVICE)
    category_model.load_state_dict(torch.load(cat_model_path, map_location=DEVICE))
    category_model.eval()

    # Marker-subtype classifier. "eager" attention so attention weights can
    # later be returned (get_attention_spans passes output_attentions=True).
    sub_model_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_marker_subtype.pt")
    sub_labels_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_marker_subtype_labels.json")
    with open(sub_labels_path) as f:
        subtype_labels = json.load(f)
    subtype_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(subtype_labels),
        attn_implementation="eager"
    ).to(DEVICE)
    subtype_model.load_state_dict(torch.load(sub_model_path, map_location=DEVICE))
    subtype_model.eval()

    print("Sentence-level models loaded!")
    return tokenizer, doc_model, category_model, category_labels, subtype_model, subtype_labels
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def load_token_classifier():
    """Load the token-level marker classifier from the HuggingFace Hub.

    Returns:
        tuple: (tok_tokenizer, tok_model, idx_to_type) where idx_to_type maps
        class index -> marker-type name (inverted from type_to_idx.json).
    """
    print("Loading token classifier...")

    tok_tokenizer = AutoTokenizer.from_pretrained(TOKEN_MODEL_REPO)
    # trust_remote_code: the repo ships a custom model class
    # (MultiLabelTokenClassifier per the module docstring).
    tok_model = AutoModel.from_pretrained(TOKEN_MODEL_REPO, trust_remote_code=True)
    tok_model.to(DEVICE)
    tok_model.eval()

    type_map_path = hf_hub_download(TOKEN_MODEL_REPO, "type_to_idx.json")
    with open(type_map_path) as f:
        type_to_idx = json.load(f)
    # Invert name -> index into index -> name for decoding predictions.
    idx_to_type = {v: k for k, v in type_to_idx.items()}

    print(f"Token classifier loaded! ({len(type_to_idx)} marker types)")
    return tok_tokenizer, tok_model, idx_to_type
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# Load all models at startup (runs at import time; downloads weights from the
# Hub on first run, so Space startup may take a while).
tokenizer, doc_model, category_model, category_labels, subtype_model, subtype_labels = load_models()
tok_tokenizer, tok_model, idx_to_type = load_token_classifier()
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def split_sentences(text):
    """Break *text* into sentences at whitespace that follows '.', '!' or '?'.

    Empty/whitespace-only fragments are dropped; each sentence is stripped.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    cleaned = []
    for piece in pieces:
        stripped = piece.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def predict_doc_score(text):
    """Predict the document-level orality score in [0, 1].

    Texts of <= 512 tokens get a single forward pass; longer texts are scored
    in overlapping 512-token windows (stride 448, i.e. 64 tokens of overlap)
    and the window scores are averaged.
    """
    # Tokenize without truncation first to learn the true token length.
    full_encoding = tokenizer(text, truncation=False, return_tensors='pt')
    total_tokens = full_encoding['input_ids'].shape[1]

    if total_tokens <= 512:
        encoding = tokenizer(
            text, truncation=True, max_length=512,
            padding='max_length', return_tensors='pt'
        )
        with torch.no_grad():
            score = doc_model(
                encoding['input_ids'].to(DEVICE),
                encoding['attention_mask'].to(DEVICE)
            )
        return score.item()

    # Long-text path: sliding windows over the un-truncated token ids.
    input_ids = full_encoding['input_ids'][0]
    chunk_size = 512
    stride = 448  # 64-token overlap between consecutive windows
    scores = []

    for start in range(0, total_tokens, stride):
        end = min(start + chunk_size, total_tokens)
        chunk_ids = input_ids[start:end].unsqueeze(0)

        # Right-pad the final (short) window up to the fixed chunk size.
        if chunk_ids.shape[1] < chunk_size:
            pad_length = chunk_size - chunk_ids.shape[1]
            chunk_ids = torch.nn.functional.pad(chunk_ids, (0, pad_length), value=tokenizer.pad_token_id)

        # Mask out the padding just added so it doesn't affect the score.
        attention_mask = (chunk_ids != tokenizer.pad_token_id).long()

        with torch.no_grad():
            score = doc_model(
                chunk_ids.to(DEVICE),
                attention_mask.to(DEVICE)
            )
        scores.append(score.item())

        if end >= total_tokens:
            break

    # Unweighted mean over windows (a short last window counts equally).
    return sum(scores) / len(scores)
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def predict_category(text):
    """Classify one sentence as oral vs literate.

    Returns:
        tuple: (label, confidence) — the winning label name and its softmax
        probability.
    """
    enc = tokenizer(
        text,
        truncation=True,
        max_length=128,
        padding='max_length',
        return_tensors='pt',
    )
    with torch.no_grad():
        model_out = category_model(
            enc['input_ids'].to(DEVICE),
            enc['attention_mask'].to(DEVICE),
        )

    distribution = torch.softmax(model_out.logits, dim=1)
    best_idx = torch.argmax(distribution, dim=1).item()
    best_prob = distribution[0][best_idx].item()

    # category_labels maps name -> index; invert for lookup by index.
    label_of = {index: name for name, index in category_labels.items()}
    return label_of[best_idx], best_prob
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def predict_subtype(text, top_k=3, threshold=0.05):
    """Predict marker subtype(s) for a sentence.

    Returns up to *top_k* {'marker', 'confidence'} dicts whose softmax
    probability is >= *threshold*. If none clears the threshold, the single
    best guess is returned anyway, so callers always get at least one entry.
    """
    encoding = tokenizer(
        text, truncation=True, max_length=128,
        padding='max_length', return_tensors='pt'
    )
    with torch.no_grad():
        outputs = subtype_model(
            encoding['input_ids'].to(DEVICE),
            encoding['attention_mask'].to(DEVICE)
        )
    probs = torch.softmax(outputs.logits, dim=1)
    top_probs, top_indices = torch.topk(probs, k=top_k, dim=1)

    # subtype_labels maps name -> index; invert for lookup by predicted index.
    id_to_label = {v: k for k, v in subtype_labels.items()}

    markers = []
    for prob, idx in zip(top_probs[0], top_indices[0]):
        conf = prob.item()
        if conf >= threshold:
            markers.append({
                'marker': id_to_label[idx.item()],
                'confidence': round(conf, 3)
            })

    # Fallback: always return the argmax even when it is below threshold.
    if not markers:
        markers = [{
            'marker': id_to_label[top_indices[0][0].item()],
            'confidence': round(top_probs[0][0].item(), 3)
        }]

    return markers
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def predict_spans(text):
    """Run token classifier and return character-level spans for Tier 1 markers.

    Returns a list of span dicts ({'text', 'marker', 'category', 'start',
    'end'}) with character offsets into the original text, sorted by start
    position. Only markers in TIER1_MARKERS are included.
    """
    encoding = tok_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        return_offsets_mapping=True,
    )

    offset_mapping = encoding.pop("offset_mapping")[0]  # (seq_len, 2)
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    with torch.no_grad():
        # Prefer the model's own decode() when the custom class provides one;
        # otherwise fall back to per-token argmax over raw logits.
        if hasattr(tok_model, "decode"):
            preds = tok_model.decode(input_ids, attention_mask)
        else:
            logits = tok_model(input_ids, attention_mask)
            preds = logits.argmax(dim=-1)

    # preds shape: (1, seq_len, num_types) where values are 0=O, 1=B, 2=I
    preds = preds[0]  # (seq_len, num_types)
    seq_len = attention_mask.sum().item()

    # Collect spans per marker type using BIO transitions
    spans = []

    for type_idx in range(preds.shape[1]):
        marker_name = idx_to_type.get(type_idx)
        if marker_name is None or marker_name not in TIER1_MARKERS:
            continue

        # Token index where the currently-open span began, or None.
        span_start_tok = None

        for tok_pos in range(seq_len):
            tag = preds[tok_pos, type_idx].item()
            offsets = offset_mapping[tok_pos].tolist()

            # Skip special tokens (offset 0,0). They also terminate any open
            # span; tok_pos > 0 exempts a real first token starting at char 0.
            if offsets[0] == 0 and offsets[1] == 0 and tok_pos > 0:
                if span_start_tok is not None:
                    _emit_token_span(spans, text, offset_mapping, span_start_tok, tok_pos, marker_name)
                    span_start_tok = None
                continue

            if tag == 1:  # B: close any open span, then begin a new one here
                if span_start_tok is not None:
                    _emit_token_span(spans, text, offset_mapping, span_start_tok, tok_pos, marker_name)
                span_start_tok = tok_pos
            elif tag == 2:  # I: continue the open span
                if span_start_tok is None:
                    span_start_tok = tok_pos  # orphan I, treat as B
            else:  # O: close any open span
                if span_start_tok is not None:
                    _emit_token_span(spans, text, offset_mapping, span_start_tok, tok_pos, marker_name)
                    span_start_tok = None

        # Flush a span still open at the end of the sequence.
        if span_start_tok is not None:
            _emit_token_span(spans, text, offset_mapping, span_start_tok, seq_len, marker_name)

    # Sort by start position
    spans.sort(key=lambda s: (s["start"], s["end"]))
    return spans
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def _emit_token_span(spans, text, offset_mapping, start_tok, end_tok, marker_name):
|
| 328 |
+
"""Convert token indices to a character-level span dict."""
|
| 329 |
+
char_start = int(offset_mapping[start_tok][0])
|
| 330 |
+
char_end = int(offset_mapping[end_tok - 1][1])
|
| 331 |
+
if char_end > char_start:
|
| 332 |
+
span_text = text[char_start:char_end]
|
| 333 |
+
category = "oral" if marker_name.startswith("oral_") else "literate"
|
| 334 |
+
spans.append({
|
| 335 |
+
"text": span_text,
|
| 336 |
+
"marker": marker_name,
|
| 337 |
+
"category": category,
|
| 338 |
+
"start": char_start,
|
| 339 |
+
"end": char_end,
|
| 340 |
+
})
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def get_attention_spans(text, threshold=0.15):
    """Extract high-attention token spans from the subtype model.

    Averages the last layer's attention heads, takes the [CLS] row,
    max-normalizes it, then groups consecutive non-punctuation tokens whose
    normalized attention is >= *threshold* into character-offset spans.
    """
    encoding = tokenizer(
        text, truncation=True, max_length=128,
        padding='max_length', return_tensors='pt',
        return_offsets_mapping=True
    )

    offset_mapping = encoding.pop('offset_mapping')[0]
    input_ids = encoding['input_ids'].to(DEVICE)
    attention_mask = encoding['attention_mask'].to(DEVICE)

    with torch.no_grad():
        outputs = subtype_model.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=True
        )

    # attentions[-1] is the last layer, (batch, heads, seq, seq); take batch 0.
    last_layer = outputs.attentions[-1][0]
    # Mean over heads, then row 0 = attention paid by the [CLS] token.
    cls_attention = last_layer.mean(dim=0)[0]

    # Max-normalize so the threshold is relative to the strongest token.
    max_attn = cls_attention.max()
    if max_attn > 0:
        cls_attention = cls_attention / max_attn

    seq_len = attention_mask.sum().item()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0][:seq_len])
    offsets = offset_mapping[:seq_len].tolist()
    attn_values = cls_attention[:seq_len].tolist()

    # Punctuation tokens break spans and are never included in one.
    PUNCT = {'.', ',', '!', '?', ';', ':', '-', '--', "'", '"',
             '(', ')', '[', ']', '...', '\u2013', '\u2014'}

    spans = []
    in_span = False        # currently accumulating a span?
    span_start_idx = 0     # token index where the open span began
    span_attentions = []   # normalized attention values inside the open span

    for i, (token, attn, (cs, ce)) in enumerate(zip(tokens, attn_values, offsets)):
        # (0, 0) offsets mark special tokens ([CLS]/[SEP]): close any open span.
        if cs == 0 and ce == 0:
            if in_span and span_attentions:
                _emit_attn_span(spans, text, offsets, span_start_idx, i, span_attentions)
            in_span = False
            span_attentions = []
            continue

        if token in PUNCT:
            if in_span and span_attentions:
                _emit_attn_span(spans, text, offsets, span_start_idx, i, span_attentions)
            in_span = False
            span_attentions = []
            continue

        if attn >= threshold:
            if not in_span:
                in_span = True
                span_start_idx = i
                span_attentions = [attn]
            else:
                span_attentions.append(attn)
        else:
            if in_span and span_attentions:
                _emit_attn_span(spans, text, offsets, span_start_idx, i, span_attentions)
            in_span = False
            span_attentions = []

    # Flush a span still open at the end of the sequence.
    if in_span and span_attentions:
        _emit_attn_span(spans, text, offsets, span_start_idx, seq_len, span_attentions)

    return spans
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
def _emit_attn_span(spans, text, offsets, start_idx, end_idx, attentions):
|
| 417 |
+
"""Helper to build an attention span dict from token indices."""
|
| 418 |
+
char_start = int(offsets[start_idx][0])
|
| 419 |
+
char_end = int(offsets[end_idx - 1][1])
|
| 420 |
+
if char_end > char_start:
|
| 421 |
+
spans.append({
|
| 422 |
+
'char_start': char_start,
|
| 423 |
+
'char_end': char_end,
|
| 424 |
+
'text': text[char_start:char_end],
|
| 425 |
+
'attention': round(sum(attentions) / len(attentions), 3)
|
| 426 |
+
})
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def detect_language(text):
    """Detect the language of text.

    Returns:
        tuple: (lang_code, is_english, warning). lang_code is the detected
        code (e.g. 'en') or None when detection is unavailable or skipped;
        warning is a user-facing message for non-English text, else None.
    """
    if not HAS_LANGDETECT:
        return None, True, None
    try:
        # Very short texts make detection unreliable — skip rather than guess.
        if len(text.split()) < 10:
            return None, True, None
        # The first 1000 chars are enough for detection and keep it fast.
        lang = detect(text[:1000])
        is_english = lang == 'en'
        if not is_english:
            warning = f"Non-English text detected ({lang}). Results may be unreliable as the model was trained primarily on English text."
            return lang, False, warning
        return lang, True, None
    except Exception:
        # Detection failure is non-fatal; proceed as if the text is English.
        return None, True, None
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
def log_usage(text, score, oral_count, literate_count, sentence_results):
    """Log API usage to Cloudflare D1 via track endpoint (fire and forget).

    The POST runs in a daemon thread so analysis latency is unaffected; any
    failure is printed and otherwise ignored.
    """
    def _send():
        try:
            # Imported lazily inside the worker thread.
            import urllib.request
            import urllib.error

            # Cheap text statistics recomputed here (independent of the
            # analysis pipeline's own sentence splitting).
            words = text.split() if text else []
            sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()] if text else []
            unique_words = len(set(w.lower() for w in words if w.isalpha()))

            word_count = len(words)
            sentence_count = len(sentences)
            avg_sentence_length = round(word_count / sentence_count, 1) if sentence_count else 0
            alpha_words = [w for w in words if w.isalpha()]
            avg_word_length = round(sum(len(w) for w in alpha_words) / len(alpha_words), 1) if alpha_words else 0
            lexical_diversity = round((unique_words / word_count) * 100) if word_count else 0

            # Frequency of each sentence's top marker.
            marker_counts = {}
            for sent in sentence_results:
                marker = sent.get('marker', '')
                if marker:
                    marker_counts[marker] = marker_counts.get(marker, 0) + 1

            # Pseudo-unique id: hex timestamp + hex random 32-bit value.
            # NOTE(review): uses random, not secrets — acceptable only because
            # this is an analytics id, not a security token.
            analysis_id = hex(int(time.time()))[2:] + hex(random.randint(0, 0xFFFFFFFF))[2:]

            payload = {
                "page": "huggingface_api",
                "text": text[:10000] if text else "",
                "score": score,
                "word_count": word_count,
                "sentence_count": sentence_count,
                "avg_sentence_length": avg_sentence_length,
                "avg_word_length": avg_word_length,
                "lexical_diversity": lexical_diversity,
                "oral_marker_count": oral_count,
                "literate_marker_count": literate_count,
                "markers_json": json.dumps(marker_counts) if marker_counts else None,
                "analysis_id": analysis_id,
            }

            data = json.dumps(payload).encode('utf-8')
            req = urllib.request.Request(
                TRACK_URL,
                data=data,
                headers={'Content-Type': 'application/json', 'User-Agent': 'HavelockSpace/1.7'},
                method='POST'
            )
            urllib.request.urlopen(req, timeout=5)
        except Exception as e:
            # Best-effort tracking: log and move on, never raise.
            print(f"Tracking failed: {e}")

    thread = threading.Thread(target=_send, daemon=True)
    thread.start()
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
def analyze_api(text):
    """JSON API for website integration.

    v1.7: Adds spans field with token-level marker predictions (Tier 1 only).

    Returns a dict with the ensemble score (0-100), component scores, marker
    counts, and per-sentence results — or {'error': ...} for empty/oversized
    input. The key names are an external contract with the website client.
    """
    if not text or len(text.strip()) < 10:
        return {"error": "Please enter at least 10 characters of text."}

    if len(text) > MAX_INPUT_CHARS:
        return {"error": f"Text too long. Maximum {MAX_INPUT_CHARS:,} characters allowed ({len(text):,} provided)."}

    # Language detection
    lang_code, is_english, lang_warning = detect_language(text)

    # Document-level score
    doc_score = predict_doc_score(text)

    # Sentence-level analysis (capped at MAX_SENTENCES for latency)
    sentences = split_sentences(text)
    total_sentence_count = len(sentences)
    truncated = total_sentence_count > MAX_SENTENCES
    if truncated:
        sentences = sentences[:MAX_SENTENCES]

    oral_count = 0
    literate_count = 0
    oral_weighted = 0.0      # sum of classifier confidences for oral sentences
    literate_weighted = 0.0  # sum of classifier confidences for literate ones
    sentence_results = []

    for sent in sentences:
        # Skip fragments and pathological run-on "sentences".
        word_count = len(sent.split())
        if word_count < 3 or word_count > MAX_WORDS_PER_SENTENCE:
            continue

        category, cat_confidence = predict_category(sent)
        markers = predict_subtype(sent)
        attention_spans = get_attention_spans(sent)
        spans = predict_spans(sent)

        if category == 'oral':
            oral_count += 1
            oral_weighted += cat_confidence
        else:
            literate_count += 1
            literate_weighted += cat_confidence

        sentence_results.append({
            'text': sent,
            'category': category,
            'category_confidence': round(cat_confidence, 3),
            'marker': markers[0]['marker'],
            'confidence': markers[0]['confidence'],
            'markers': markers,
            'attention_spans': attention_spans,
            'spans': spans,
        })

    # Ensemble scoring: blend the document model with the confidence-weighted
    # oral-sentence ratio using DOC_MODEL_WEIGHT / SENTENCE_WEIGHT.
    total = oral_count + literate_count
    total_weighted = oral_weighted + literate_weighted
    sentence_ratio_binary = oral_count / total if total > 0 else 0.5
    sentence_ratio_weighted = oral_weighted / total_weighted if total_weighted > 0 else 0.5
    ensemble_score = (DOC_MODEL_WEIGHT * doc_score) + (SENTENCE_WEIGHT * sentence_ratio_weighted)

    result = {
        'score': round(ensemble_score * 100),
        'doc_score': round(doc_score, 3),
        'sentence_ratio': round(sentence_ratio_weighted, 3),
        'sentence_ratio_binary': round(sentence_ratio_binary, 3),
        'oral_count': oral_count,
        'literate_count': literate_count,
        'oral_weighted': round(oral_weighted, 3),
        'literate_weighted': round(literate_weighted, 3),
        'sentences': sentence_results
    }

    if lang_warning:
        result['language_warning'] = lang_warning
        result['detected_language'] = lang_code

    if truncated:
        result['truncation_warning'] = f"This text has {total_sentence_count} sentences. Only the first {MAX_SENTENCES} were analyzed."
        result['total_sentences'] = total_sentence_count
        result['analyzed_sentences'] = MAX_SENTENCES

    # Fire-and-forget usage logging (daemon thread inside log_usage).
    log_usage(text, result['score'], oral_count, literate_count, sentence_results)

    return result
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
def analyze_text(text):
    """Main analysis function for Gradio UI.

    Scores *text* on the oral-literate spectrum by blending the document-level
    model with a confidence-weighted per-sentence vote, then renders the result
    as three HTML panels.

    Args:
        text: Raw user-submitted text to analyze.

    Returns:
        Tuple of ``(score_html, sentences_html, reference_html)``. For invalid
        input the first element is a plain error message and the other two are
        empty strings.
    """
    import html  # stdlib; escape user-derived text before HTML interpolation

    if not text or len(text.strip()) < 10:
        return "Please enter some text to analyze.", "", ""

    if len(text) > MAX_INPUT_CHARS:
        return f"Text too long. Maximum {MAX_INPUT_CHARS:,} characters allowed.", "", ""

    _lang_code, _is_english, lang_warning = detect_language(text)
    doc_score = predict_doc_score(text)

    # Bound latency on long inputs: analyze at most MAX_SENTENCES sentences.
    sentences = split_sentences(text)
    total_sentence_count = len(sentences)
    truncated = total_sentence_count > MAX_SENTENCES
    if truncated:
        sentences = sentences[:MAX_SENTENCES]

    oral_count = 0
    literate_count = 0
    oral_weighted = 0.0
    literate_weighted = 0.0
    sentence_results = []

    for sent in sentences:
        # Skip fragments and extreme run-ons the sentence classifiers
        # are not meant to score.
        word_count = len(sent.split())
        if word_count < 3 or word_count > MAX_WORDS_PER_SENTENCE:
            continue

        category, cat_confidence = predict_category(sent)
        markers = predict_subtype(sent)

        if category == 'oral':
            oral_count += 1
            oral_weighted += cat_confidence
        else:
            literate_count += 1
            literate_weighted += cat_confidence

        sentence_results.append({
            # Display preview only; long sentences are truncated at 200 chars.
            'sentence': sent[:200] + '...' if len(sent) > 200 else sent,
            'category': category,
            'cat_confidence': cat_confidence,
            'subtype': markers[0]['marker'],
            'confidence': markers[0]['confidence']
        })

    # Ensemble: confidence-weighted sentence ratio blended with the document
    # model; 0.5 is the neutral fallback when no sentence qualified.
    total = oral_count + literate_count
    total_weighted = oral_weighted + literate_weighted
    sentence_ratio = oral_weighted / total_weighted if total_weighted > 0 else 0.5
    ensemble_score = (DOC_MODEL_WEIGHT * doc_score) + (SENTENCE_WEIGHT * sentence_ratio)

    if ensemble_score >= 0.65:
        mode = "ORAL"
        mode_desc = "High oral characteristics - repetition, direct address, concrete imagery"
        color = "#228B22"
    elif ensemble_score >= 0.35:
        mode = "MIXED"
        mode_desc = "Mixed oral and literate characteristics"
        color = "#B8860B"
    else:
        mode = "LITERATE"
        mode_desc = "High literate characteristics - abstraction, subordination, hedging"
        color = "#4169E1"

    # 30-character unicode gauge from literate (left) to oral (right).
    bar_len = int(ensemble_score * 30)
    bar = "\u2588" * bar_len + "\u2591" * (30 - bar_len)

    warnings_html = ""
    if lang_warning:
        # Escaped: the warning may echo detected-language details.
        warnings_html += f"""
        <div style="background: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; padding: 12px; border-radius: 8px; margin-bottom: 15px;">
            <strong>Warning:</strong> {html.escape(lang_warning)}
        </div>
        """
    if truncated:
        warnings_html += f"""
        <div style="background: #fff3cd; border: 1px solid #ffc107; color: #856404; padding: 12px; border-radius: 8px; margin-bottom: 15px;">
            <strong>Note:</strong> This text has {total_sentence_count} sentences. Only the first {MAX_SENTENCES} were analyzed.
        </div>
        """

    score_html = f"""
    <div style="font-family: system-ui; padding: 20px; background: #f8f9fa; border-radius: 10px;">
        {warnings_html}
        <h2 style="color: {color}; margin-bottom: 10px;">Orality Score: {ensemble_score:.2f} ({mode})</h2>
        <p style="color: #666; margin-bottom: 15px;">{mode_desc}</p>
        <div style="font-family: monospace; font-size: 14px; margin-bottom: 15px;">
            <span style="color: #4169E1;">Literate</span> [{bar}] <span style="color: #228B22;">Oral</span>
        </div>
        <div style="font-size: 13px; color: #555;">
            <strong>Score Components:</strong><br>
            - Sentence analysis: {oral_count}/{total} oral ({sentence_ratio:.0%}) [confidence-weighted]<br>
            - Document model: {doc_score:.2f}<br>
            - Ensemble weights: {int(SENTENCE_WEIGHT*100)}% sentence + {int(DOC_MODEL_WEIGHT*100)}% document
        </div>
    </div>
    """

    if sentence_results:
        rows = ""
        # Render at most 20 rows to keep the page light.
        for r in sentence_results[:20]:
            cat_color = "#228B22" if r['category'] == 'oral' else "#4169E1"
            # User sentence text is escaped to prevent HTML/script injection.
            rows += f"""
            <tr>
                <td style="color: {cat_color}; font-weight: bold; padding: 8px;">{r['category'].upper()}</td>
                <td style="padding: 8px;">{r['subtype']} ({r['confidence']:.0%})</td>
                <td style="padding: 8px; font-style: italic;">"{html.escape(r['sentence'])}"</td>
            </tr>
            """

        sentences_html = f"""
        <div style="font-family: system-ui; padding: 20px; background: #f8f9fa; border-radius: 10px; margin-top: 20px;">
            <h3 style="margin-bottom: 15px;">Sentence-Level Analysis</h3>
            <table style="width: 100%; border-collapse: collapse; font-size: 13px;">
                <tr style="background: #e9ecef;">
                    <th style="padding: 8px; text-align: left; width: 80px;">Category</th>
                    <th style="padding: 8px; text-align: left; width: 150px;">Marker Type</th>
                    <th style="padding: 8px; text-align: left;">Sentence</th>
                </tr>
                {rows}
            </table>
            {f'<p style="color: #666; margin-top: 10px; font-size: 12px;">...and {len(sentence_results) - 20} more sentences analyzed</p>' if len(sentence_results) > 20 else ''}
        </div>
        """
    else:
        sentences_html = ""

    reference_html = """
    <div style="font-family: system-ui; padding: 20px; background: #fff3cd; border-radius: 10px; margin-top: 20px;">
        <h4 style="margin-bottom: 10px;">Reference: Orality Scores by Genre</h4>
        <table style="font-size: 12px; width: 100%;">
            <tr><td><strong>0.9+</strong></td><td>Epic poetry, hip-hop, spoken word</td></tr>
            <tr><td><strong>0.7-0.9</strong></td><td>Speeches, sermons, podcasts</td></tr>
            <tr><td><strong>0.4-0.7</strong></td><td>Essays, blogs, casual writing</td></tr>
            <tr><td><strong>0.1-0.4</strong></td><td>Journalism, technical writing</td></tr>
            <tr><td><strong>&lt;0.1</strong></td><td>Academic papers, legal documents, philosophy</td></tr>
        </table>
    </div>
    """

    return score_html, sentences_html, reference_html
|
| 734 |
+
|
| 735 |
+
|
| 736 |
+
# Example texts for the Gradio Examples picker, chosen to contrast oral and
# literate registers: epic invocation, wartime oratory, academic statistics
# prose, and casual conversational speech.
examples = [
    ["Tell me, O Muse, of that ingenious hero who travelled far and wide after he had sacked the famous town of Troy. Many cities did he visit, and many were the nations with whose manners and customs he was acquainted."],
    ["We will fight on the beaches, we will fight on the landing grounds, we will fight in the fields and in the streets, we will fight in the hills; we will never surrender."],
    ["The analysis of variance revealed a statistically significant effect of treatment condition on participant response latency, F(2, 147) = 4.23, p < .05, suggesting that the experimental manipulation influenced cognitive processing speed."],
    ["So like, I was just thinking about this the other day, right? And it's crazy because we never really talk about how much social media has changed everything. You know what I mean?"],
]
|
| 743 |
+
|
| 744 |
+
# Build interface.
# Layout (reconstructed — NOTE(review): indentation was lost upstream, so the
# nesting of the output panels relative to the Row is an assumption to confirm):
# a two-column row with input on the left and the score panel on the right,
# followed by the sentence table and genre reference.
with gr.Blocks(title="Havelock.AI - Orality Analyzer (Substring)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Havelock.AI - Orality Analyzer (Substring Staging)

    Analyze text for **oral vs literate characteristics** based on Walter Ong's linguistic framework.

    **v1.7**: Token-level span predictions for Tier 1 markers (F1 >= 0.50).
    Use the API endpoint for span data — the Gradio UI shows sentence-level results only.
    """)

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Enter text to analyze",
                placeholder="Paste your text here...",
                lines=8
            )
            analyze_btn = gr.Button("Analyze", variant="primary")
            gr.Examples(examples, inputs=text_input, label="Try these examples")

        with gr.Column(scale=1):
            score_output = gr.HTML(label="Orality Score")

    sentences_output = gr.HTML(label="Sentence Analysis")
    reference_output = gr.HTML(label="Reference")

    # Wire the Analyze button to the UI-rendering analysis function.
    analyze_btn.click(
        fn=analyze_text,
        inputs=text_input,
        outputs=[score_output, sentences_output, reference_output]
    )

    # Hidden API interface for website integration: invisible widgets expose
    # analyze_api as a named Gradio endpoint ("analyze") without visible UI.
    api_input = gr.Textbox(visible=False)
    api_output = gr.JSON(visible=False)
    api_btn = gr.Button(visible=False)
    api_btn.click(fn=analyze_api, inputs=api_input, outputs=api_output, api_name="analyze")

demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
transformers
|
| 3 |
+
huggingface_hub
|
| 4 |
+
langdetect
|
| 5 |
+
safetensors
|