ritammehta commited on
Commit
6ed7580
·
verified ·
1 Parent(s): 7195696

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +4 -6
  2. app.py +783 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,10 @@
1
  ---
2
  title: Havelock Demo Substring
3
- emoji: 🏃
4
- colorFrom: green
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Havelock Demo Substring
3
+ emoji: 🔬
4
+ colorFrom: purple
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 6.3.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
 
app.py ADDED
@@ -0,0 +1,783 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # v1.7 - Substring-level marker highlighting
2
+ """
3
+ Havelock.AI - Orality Analyzer (Substring Staging)
4
+
5
+ Adds token-level span predictions from the trained MultiLabelTokenClassifier
6
+ (HavelockAI/bert-token-classifier) alongside existing sentence-level models.
7
+
8
+ Only Tier 1 markers (F1 >= 0.50) produce substring spans.
9
+ """
10
+
11
+ import gradio as gr
12
+ import torch
13
+ import torch.nn as nn
14
+ from transformers import (
15
+ AutoModel,
16
+ AutoTokenizer,
17
+ BertTokenizerFast,
18
+ BertModel,
19
+ BertForSequenceClassification,
20
+ )
21
+ from huggingface_hub import hf_hub_download
22
+ import json
23
+ import re
24
+ import threading
25
+ import time
26
+ import random
27
+
28
# Tracking endpoint (fire-and-forget usage logging; see log_usage)
TRACK_URL = "https://havelock.ai/api/track"

# Language detection is an optional dependency; analysis still works without it.
try:
    from langdetect import detect, DetectorFactory
    DetectorFactory.seed = 0  # make langdetect deterministic across calls
    HAS_LANGDETECT = True
except ImportError:
    HAS_LANGDETECT = False
    print("Warning: langdetect not installed. Language detection disabled.")

# Model repositories
MODEL_REPO = "thestalwart/havelock-orality"
TOKEN_MODEL_REPO = "HavelockAI/bert-token-classifier"

# Device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Ensemble weights: final score = DOC_MODEL_WEIGHT * doc model
# + SENTENCE_WEIGHT * confidence-weighted sentence ratio.
DOC_MODEL_WEIGHT = 0.35
SENTENCE_WEIGHT = 0.65

# Sentence analysis cap (inputs beyond these limits are truncated/rejected)
MAX_SENTENCES = 100
MAX_INPUT_CHARS = 50000
MAX_WORDS_PER_SENTENCE = 150

# Tier 1 markers: F1 >= 0.50 from manifest.
# Only these marker types produce substring spans in predict_spans().
TIER1_MARKERS = {
    "oral_vocative",
    "literate_technical_abbreviation",
    "oral_phatic_check",
    "oral_imperative",
    "oral_specific_place",
    "literate_citation",
    "literate_agentless_passive",
    "oral_rhetorical_question",
    "oral_inclusive_we",
    "oral_second_person",
    "oral_named_individual",
    "literate_nominalization",
    "literate_probability",
}
72
+
73
+
74
class BertOralityRegressor(nn.Module):
    """BERT model with regression head for orality scoring.

    Produces one score in [0, 1] per input: a sigmoid-squashed linear
    projection of BERT's pooled [CLS] representation.

    NOTE(review): attribute names (bert / dropout / regressor / sigmoid)
    must stay stable — the checkpoint is restored via load_state_dict().
    """
    def __init__(self, bert_model_name='bert-base-uncased', dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout)
        # Single-unit head: hidden_size -> scalar orality score.
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.regressor(pooled_output)
        # Drop the trailing singleton dim: (batch, 1) -> (batch,)
        return self.sigmoid(logits).squeeze(-1)
89
+
90
+
91
def load_models():
    """Download and load all sentence-level models from HuggingFace Hub.

    Returns:
        (tokenizer, doc_model, category_model, category_labels,
         subtype_model, subtype_labels) — label dicts map label name -> index.
    """
    print("Loading sentence-level models...")

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    # Load document regressor (custom head; weights restored from a .pt file)
    doc_model_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_orality_regressor.pt")
    doc_model = BertOralityRegressor().to(DEVICE)
    doc_model.load_state_dict(torch.load(doc_model_path, map_location=DEVICE))
    doc_model.eval()

    # Load category classifier (oral vs literate)
    cat_model_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_marker_category.pt")
    cat_labels_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_marker_category_labels.json")
    with open(cat_labels_path) as f:
        category_labels = json.load(f)
    category_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(category_labels)
    ).to(DEVICE)
    category_model.load_state_dict(torch.load(cat_model_path, map_location=DEVICE))
    category_model.eval()

    # Load subtype classifier (fine-grained marker types)
    sub_model_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_marker_subtype.pt")
    sub_labels_path = hf_hub_download(repo_id=MODEL_REPO, filename="bert_marker_subtype_labels.json")
    with open(sub_labels_path) as f:
        subtype_labels = json.load(f)
    subtype_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(subtype_labels),
        # eager attention so output_attentions=True works in get_attention_spans()
        attn_implementation="eager"
    ).to(DEVICE)
    subtype_model.load_state_dict(torch.load(sub_model_path, map_location=DEVICE))
    subtype_model.eval()

    print("Sentence-level models loaded!")
    return tokenizer, doc_model, category_model, category_labels, subtype_model, subtype_labels
128
+
129
+
130
def load_token_classifier():
    """Load the token-level classifier from HuggingFace Hub.

    Returns:
        (tok_tokenizer, tok_model, idx_to_type) where idx_to_type maps a
        prediction index back to its marker-type name.
    """
    print("Loading token classifier...")

    tok_tokenizer = AutoTokenizer.from_pretrained(TOKEN_MODEL_REPO)
    # trust_remote_code: the repo ships a custom model class
    # (MultiLabelTokenClassifier, per the module docstring).
    tok_model = AutoModel.from_pretrained(TOKEN_MODEL_REPO, trust_remote_code=True)
    tok_model.to(DEVICE)
    tok_model.eval()

    type_map_path = hf_hub_download(TOKEN_MODEL_REPO, "type_to_idx.json")
    with open(type_map_path) as f:
        type_to_idx = json.load(f)
    # Invert to look up marker names by prediction index.
    idx_to_type = {v: k for k, v in type_to_idx.items()}

    print(f"Token classifier loaded! ({len(type_to_idx)} marker types)")
    return tok_tokenizer, tok_model, idx_to_type
146
+
147
+
148
# Load all models at startup (module import time), so the first request
# doesn't pay the download/initialization cost.
tokenizer, doc_model, category_model, category_labels, subtype_model, subtype_labels = load_models()
tok_tokenizer, tok_model, idx_to_type = load_token_classifier()
151
+
152
+
153
def split_sentences(text):
    """Split *text* into sentences on terminal punctuation (., !, ?).

    Whitespace-only fragments are dropped and each sentence is stripped.
    """
    # Break after ., ! or ? that is followed by whitespace.
    pieces = re.split(r'(?<=[.!?])\s+', text)
    return [piece.strip() for piece in pieces if piece.strip()]
157
+
158
+
159
def predict_doc_score(text):
    """Predict document-level orality score with chunking for long texts.

    Texts of up to 512 tokens are scored in one pass. Longer texts are
    scored with a sliding window (512-token chunks, stride 448, i.e. 64
    tokens of overlap) and the chunk scores are averaged.

    Returns:
        float in [0, 1] (sigmoid output of the regressor).
    """
    full_encoding = tokenizer(text, truncation=False, return_tensors='pt')
    total_tokens = full_encoding['input_ids'].shape[1]

    if total_tokens <= 512:
        # Short path: single padded window.
        encoding = tokenizer(
            text, truncation=True, max_length=512,
            padding='max_length', return_tensors='pt'
        )
        with torch.no_grad():
            score = doc_model(
                encoding['input_ids'].to(DEVICE),
                encoding['attention_mask'].to(DEVICE)
            )
        return score.item()

    input_ids = full_encoding['input_ids'][0]
    chunk_size = 512
    stride = 448  # 64-token overlap between consecutive chunks
    scores = []

    for start in range(0, total_tokens, stride):
        end = min(start + chunk_size, total_tokens)
        chunk_ids = input_ids[start:end].unsqueeze(0)

        # Pad the final (short) chunk up to the full window size.
        if chunk_ids.shape[1] < chunk_size:
            pad_length = chunk_size - chunk_ids.shape[1]
            chunk_ids = torch.nn.functional.pad(chunk_ids, (0, pad_length), value=tokenizer.pad_token_id)

        # Mask out padding positions.
        attention_mask = (chunk_ids != tokenizer.pad_token_id).long()

        with torch.no_grad():
            score = doc_model(
                chunk_ids.to(DEVICE),
                attention_mask.to(DEVICE)
            )
        scores.append(score.item())

        if end >= total_tokens:
            break

    # Unweighted mean over chunk scores.
    return sum(scores) / len(scores)
202
+
203
+
204
def predict_category(text):
    """Classify one sentence as oral or literate.

    Returns:
        (label, confidence) — the predicted label name and its softmax
        probability.
    """
    enc = tokenizer(
        text,
        truncation=True,
        max_length=128,
        padding='max_length',
        return_tensors='pt',
    )
    ids = enc['input_ids'].to(DEVICE)
    mask = enc['attention_mask'].to(DEVICE)

    with torch.no_grad():
        logits = category_model(ids, mask).logits

    probabilities = torch.softmax(logits, dim=1)
    predicted_idx = torch.argmax(probabilities, dim=1).item()
    score = probabilities[0][predicted_idx].item()

    # category_labels maps label -> index; invert for lookup by index.
    index_to_label = {idx: label for label, idx in category_labels.items()}
    return index_to_label[predicted_idx], score
221
+
222
+
223
def predict_subtype(text, top_k=3, threshold=0.05):
    """Predict marker subtype(s) for a sentence.

    Args:
        text: sentence to classify.
        top_k: number of top candidates to consider.
        threshold: minimum softmax probability for a candidate to be kept.

    Returns:
        Non-empty list of {'marker': name, 'confidence': float} dicts in
        descending-probability order. If no candidate clears the threshold,
        the single argmax candidate is returned anyway.
    """
    encoding = tokenizer(
        text, truncation=True, max_length=128,
        padding='max_length', return_tensors='pt'
    )
    with torch.no_grad():
        outputs = subtype_model(
            encoding['input_ids'].to(DEVICE),
            encoding['attention_mask'].to(DEVICE)
        )
    probs = torch.softmax(outputs.logits, dim=1)
    top_probs, top_indices = torch.topk(probs, k=top_k, dim=1)

    # subtype_labels maps label -> index; invert for lookup by index.
    id_to_label = {v: k for k, v in subtype_labels.items()}

    markers = []
    for prob, idx in zip(top_probs[0], top_indices[0]):
        conf = prob.item()
        if conf >= threshold:
            markers.append({
                'marker': id_to_label[idx.item()],
                'confidence': round(conf, 3)
            })

    # Guarantee at least one result: fall back to the argmax candidate
    # (topk output is sorted, so [0][0] is the best one).
    if not markers:
        markers = [{
            'marker': id_to_label[top_indices[0][0].item()],
            'confidence': round(top_probs[0][0].item(), 3)
        }]

    return markers
255
+
256
+
257
def predict_spans(text):
    """Run token classifier and return character-level spans for Tier 1 markers.

    Returns a list of span dicts with character offsets into the original text.
    Only markers in TIER1_MARKERS are included.

    Decoding: for each marker type independently, the per-token BIO tags are
    walked left-to-right and every maximal B/I run is converted into a
    character span using the tokenizer's offset mapping.
    """
    encoding = tok_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        return_offsets_mapping=True,
    )

    offset_mapping = encoding.pop("offset_mapping")[0]  # (seq_len, 2)
    input_ids = encoding["input_ids"].to(DEVICE)
    attention_mask = encoding["attention_mask"].to(DEVICE)

    with torch.no_grad():
        # Prefer the model's own decode() when the custom class provides one.
        if hasattr(tok_model, "decode"):
            preds = tok_model.decode(input_ids, attention_mask)
        else:
            logits = tok_model(input_ids, attention_mask)
            preds = logits.argmax(dim=-1)

    # preds shape: (1, seq_len, num_types) where values are 0=O, 1=B, 2=I
    preds = preds[0]  # (seq_len, num_types)
    seq_len = attention_mask.sum().item()

    # Collect spans per marker type using BIO transitions
    spans = []

    for type_idx in range(preds.shape[1]):
        marker_name = idx_to_type.get(type_idx)
        if marker_name is None or marker_name not in TIER1_MARKERS:
            continue

        span_start_tok = None  # token index where the currently-open span began

        for tok_pos in range(seq_len):
            tag = preds[tok_pos, type_idx].item()
            offsets = offset_mapping[tok_pos].tolist()

            # Skip special tokens (offset 0,0); they also close an open span.
            if offsets[0] == 0 and offsets[1] == 0 and tok_pos > 0:
                if span_start_tok is not None:
                    _emit_token_span(spans, text, offset_mapping, span_start_tok, tok_pos, marker_name)
                    span_start_tok = None
                continue

            if tag == 1:  # B: close any open span, then start a new one
                if span_start_tok is not None:
                    _emit_token_span(spans, text, offset_mapping, span_start_tok, tok_pos, marker_name)
                span_start_tok = tok_pos
            elif tag == 2:  # I: continue the open span
                if span_start_tok is None:
                    span_start_tok = tok_pos  # orphan I, treat as B
            else:  # O: close any open span
                if span_start_tok is not None:
                    _emit_token_span(spans, text, offset_mapping, span_start_tok, tok_pos, marker_name)
                    span_start_tok = None

        # Flush a span still open at the end of the (truncated) sequence.
        if span_start_tok is not None:
            _emit_token_span(spans, text, offset_mapping, span_start_tok, seq_len, marker_name)

    # Sort by start position
    spans.sort(key=lambda s: (s["start"], s["end"]))
    return spans
325
+
326
+
327
+ def _emit_token_span(spans, text, offset_mapping, start_tok, end_tok, marker_name):
328
+ """Convert token indices to a character-level span dict."""
329
+ char_start = int(offset_mapping[start_tok][0])
330
+ char_end = int(offset_mapping[end_tok - 1][1])
331
+ if char_end > char_start:
332
+ span_text = text[char_start:char_end]
333
+ category = "oral" if marker_name.startswith("oral_") else "literate"
334
+ spans.append({
335
+ "text": span_text,
336
+ "marker": marker_name,
337
+ "category": category,
338
+ "start": char_start,
339
+ "end": char_end,
340
+ })
341
+
342
+
343
def get_attention_spans(text, threshold=0.15):
    """Extract high-attention token spans from the subtype model.

    Uses the last attention layer's CLS-row attention (averaged over heads
    and max-normalized to [0, 1]); each contiguous run of tokens at or above
    `threshold` becomes a span. Punctuation and special tokens break spans.

    Returns:
        list of {'char_start', 'char_end', 'text', 'attention'} dicts.
    """
    encoding = tokenizer(
        text, truncation=True, max_length=128,
        padding='max_length', return_tensors='pt',
        return_offsets_mapping=True
    )

    offset_mapping = encoding.pop('offset_mapping')[0]
    input_ids = encoding['input_ids'].to(DEVICE)
    attention_mask = encoding['attention_mask'].to(DEVICE)

    with torch.no_grad():
        outputs = subtype_model.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=True
        )

    # Attention from the CLS position to every token, averaged over heads.
    last_layer = outputs.attentions[-1][0]
    cls_attention = last_layer.mean(dim=0)[0]

    # Normalize so the strongest token scores 1.0.
    max_attn = cls_attention.max()
    if max_attn > 0:
        cls_attention = cls_attention / max_attn

    seq_len = attention_mask.sum().item()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0][:seq_len])
    offsets = offset_mapping[:seq_len].tolist()
    attn_values = cls_attention[:seq_len].tolist()

    # Tokens that should never start or continue a highlight span.
    PUNCT = {'.', ',', '!', '?', ';', ':', '-', '--', "'", '"',
             '(', ')', '[', ']', '...', '\u2013', '\u2014'}

    spans = []
    in_span = False
    span_start_idx = 0
    span_attentions = []  # attention values of tokens in the open span

    for i, (token, attn, (cs, ce)) in enumerate(zip(tokens, attn_values, offsets)):
        # Special tokens map to (0, 0) offsets: close any open span.
        if cs == 0 and ce == 0:
            if in_span and span_attentions:
                _emit_attn_span(spans, text, offsets, span_start_idx, i, span_attentions)
            in_span = False
            span_attentions = []
            continue

        # Punctuation breaks a span.
        if token in PUNCT:
            if in_span and span_attentions:
                _emit_attn_span(spans, text, offsets, span_start_idx, i, span_attentions)
            in_span = False
            span_attentions = []
            continue

        if attn >= threshold:
            if not in_span:
                in_span = True
                span_start_idx = i
                span_attentions = [attn]
            else:
                span_attentions.append(attn)
        else:
            if in_span and span_attentions:
                _emit_attn_span(spans, text, offsets, span_start_idx, i, span_attentions)
            in_span = False
            span_attentions = []

    # Flush a span running to the end of the sequence.
    if in_span and span_attentions:
        _emit_attn_span(spans, text, offsets, span_start_idx, seq_len, span_attentions)

    return spans
414
+
415
+
416
+ def _emit_attn_span(spans, text, offsets, start_idx, end_idx, attentions):
417
+ """Helper to build an attention span dict from token indices."""
418
+ char_start = int(offsets[start_idx][0])
419
+ char_end = int(offsets[end_idx - 1][1])
420
+ if char_end > char_start:
421
+ spans.append({
422
+ 'char_start': char_start,
423
+ 'char_end': char_end,
424
+ 'text': text[char_start:char_end],
425
+ 'attention': round(sum(attentions) / len(attentions), 3)
426
+ })
427
+
428
+
429
def detect_language(text):
    """Best-effort language detection.

    Returns:
        (lang_code, is_english, warning). Falls back to (None, True, None)
        when langdetect is unavailable, the text is shorter than 10 words,
        or detection raises.
    """
    if not HAS_LANGDETECT:
        return None, True, None
    try:
        if len(text.split()) < 10:
            return None, True, None
        # The first 1000 characters are enough for a language guess.
        code = detect(text[:1000])
        if code == 'en':
            return code, True, None
        warning = f"Non-English text detected ({code}). Results may be unreliable as the model was trained primarily on English text."
    except Exception:
        return None, True, None
    return code, False, warning
444
+
445
+
446
def log_usage(text, score, oral_count, literate_count, sentence_results):
    """Log API usage to Cloudflare D1 via track endpoint (fire and forget).

    Spawns a daemon thread so the caller never blocks on network I/O; any
    failure is printed and otherwise ignored.

    Args:
        text: the analyzed input (truncated to 10k chars in the payload).
        score: final ensemble score (0-100).
        oral_count / literate_count: sentence classification tallies.
        sentence_results: per-sentence dicts; their 'marker' fields are
            aggregated into a frequency map.
    """
    def _send():
        try:
            import urllib.request
            import urllib.error

            # Basic text statistics for the payload.
            words = text.split() if text else []
            sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()] if text else []
            unique_words = len(set(w.lower() for w in words if w.isalpha()))

            word_count = len(words)
            sentence_count = len(sentences)
            avg_sentence_length = round(word_count / sentence_count, 1) if sentence_count else 0
            alpha_words = [w for w in words if w.isalpha()]
            avg_word_length = round(sum(len(w) for w in alpha_words) / len(alpha_words), 1) if alpha_words else 0
            lexical_diversity = round((unique_words / word_count) * 100) if word_count else 0

            # Frequency of each sentence's top marker.
            marker_counts = {}
            for sent in sentence_results:
                marker = sent.get('marker', '')
                if marker:
                    marker_counts[marker] = marker_counts.get(marker, 0) + 1

            # Pseudo-unique id: hex timestamp + hex random salt.
            analysis_id = hex(int(time.time()))[2:] + hex(random.randint(0, 0xFFFFFFFF))[2:]

            payload = {
                "page": "huggingface_api",
                "text": text[:10000] if text else "",
                "score": score,
                "word_count": word_count,
                "sentence_count": sentence_count,
                "avg_sentence_length": avg_sentence_length,
                "avg_word_length": avg_word_length,
                "lexical_diversity": lexical_diversity,
                "oral_marker_count": oral_count,
                "literate_marker_count": literate_count,
                "markers_json": json.dumps(marker_counts) if marker_counts else None,
                "analysis_id": analysis_id,
            }

            data = json.dumps(payload).encode('utf-8')
            req = urllib.request.Request(
                TRACK_URL,
                data=data,
                headers={'Content-Type': 'application/json', 'User-Agent': 'HavelockSpace/1.7'},
                method='POST'
            )
            # FIX: close the response explicitly. The previous bare
            # urlopen() call leaked the HTTP connection until GC.
            with urllib.request.urlopen(req, timeout=5):
                pass
        except Exception as e:
            print(f"Tracking failed: {e}")

    thread = threading.Thread(target=_send, daemon=True)
    thread.start()
500
+
501
+
502
def analyze_api(text):
    """JSON API for website integration.

    v1.7: Adds spans field with token-level marker predictions (Tier 1 only).

    Returns:
        dict with 'score' (0-100 ensemble), component scores, oral/literate
        counts, and per-sentence results; or {'error': ...} for invalid input.
        Optional keys: language_warning/detected_language, and
        truncation_warning/total_sentences/analyzed_sentences when more than
        MAX_SENTENCES were supplied.
    """
    if not text or len(text.strip()) < 10:
        return {"error": "Please enter at least 10 characters of text."}

    if len(text) > MAX_INPUT_CHARS:
        return {"error": f"Text too long. Maximum {MAX_INPUT_CHARS:,} characters allowed ({len(text):,} provided)."}

    # Language detection
    lang_code, is_english, lang_warning = detect_language(text)

    # Document-level score
    doc_score = predict_doc_score(text)

    # Sentence-level analysis (capped at MAX_SENTENCES)
    sentences = split_sentences(text)
    total_sentence_count = len(sentences)
    truncated = total_sentence_count > MAX_SENTENCES
    if truncated:
        sentences = sentences[:MAX_SENTENCES]

    oral_count = 0
    literate_count = 0
    oral_weighted = 0.0       # confidence-weighted tallies
    literate_weighted = 0.0
    sentence_results = []

    for sent in sentences:
        # Skip fragments (< 3 words) and degenerate run-ons.
        word_count = len(sent.split())
        if word_count < 3 or word_count > MAX_WORDS_PER_SENTENCE:
            continue

        category, cat_confidence = predict_category(sent)
        markers = predict_subtype(sent)
        attention_spans = get_attention_spans(sent)
        spans = predict_spans(sent)

        if category == 'oral':
            oral_count += 1
            oral_weighted += cat_confidence
        else:
            literate_count += 1
            literate_weighted += cat_confidence

        sentence_results.append({
            'text': sent,
            'category': category,
            'category_confidence': round(cat_confidence, 3),
            'marker': markers[0]['marker'],
            'confidence': markers[0]['confidence'],
            'markers': markers,
            'attention_spans': attention_spans,
            'spans': spans,
        })

    # Ensemble scoring: blend the document model with the
    # confidence-weighted oral sentence ratio.
    total = oral_count + literate_count
    total_weighted = oral_weighted + literate_weighted
    sentence_ratio_binary = oral_count / total if total > 0 else 0.5
    sentence_ratio_weighted = oral_weighted / total_weighted if total_weighted > 0 else 0.5
    ensemble_score = (DOC_MODEL_WEIGHT * doc_score) + (SENTENCE_WEIGHT * sentence_ratio_weighted)

    result = {
        'score': round(ensemble_score * 100),
        'doc_score': round(doc_score, 3),
        'sentence_ratio': round(sentence_ratio_weighted, 3),
        'sentence_ratio_binary': round(sentence_ratio_binary, 3),
        'oral_count': oral_count,
        'literate_count': literate_count,
        'oral_weighted': round(oral_weighted, 3),
        'literate_weighted': round(literate_weighted, 3),
        'sentences': sentence_results
    }

    if lang_warning:
        result['language_warning'] = lang_warning
        result['detected_language'] = lang_code

    if truncated:
        result['truncation_warning'] = f"This text has {total_sentence_count} sentences. Only the first {MAX_SENTENCES} were analyzed."
        result['total_sentences'] = total_sentence_count
        result['analyzed_sentences'] = MAX_SENTENCES

    # Fire-and-forget usage logging (runs in a daemon thread).
    log_usage(text, result['score'], oral_count, literate_count, sentence_results)

    return result
591
+
592
+
593
def analyze_text(text):
    """Main analysis function for Gradio UI.

    Returns:
        three HTML fragments: (score panel, sentence table, genre reference
        card). On invalid input the first element is a plain message and the
        other two are empty strings.
    """
    if not text or len(text.strip()) < 10:
        return "Please enter some text to analyze.", "", ""

    if len(text) > MAX_INPUT_CHARS:
        return f"Text too long. Maximum {MAX_INPUT_CHARS:,} characters allowed.", "", ""

    lang_code, is_english, lang_warning = detect_language(text)
    doc_score = predict_doc_score(text)

    sentences = split_sentences(text)
    total_sentence_count = len(sentences)
    truncated = total_sentence_count > MAX_SENTENCES
    if truncated:
        sentences = sentences[:MAX_SENTENCES]

    oral_count = 0
    literate_count = 0
    oral_weighted = 0.0
    literate_weighted = 0.0
    sentence_results = []

    for sent in sentences:
        # Skip fragments (< 3 words) and degenerate run-ons.
        word_count = len(sent.split())
        if word_count < 3 or word_count > MAX_WORDS_PER_SENTENCE:
            continue

        category, cat_confidence = predict_category(sent)
        markers = predict_subtype(sent)

        if category == 'oral':
            oral_count += 1
            oral_weighted += cat_confidence
        else:
            literate_count += 1
            literate_weighted += cat_confidence

        sentence_results.append({
            # Truncate long sentences for display.
            'sentence': sent[:200] + '...' if len(sent) > 200 else sent,
            'category': category,
            'cat_confidence': cat_confidence,
            'subtype': markers[0]['marker'],
            'confidence': markers[0]['confidence']
        })

    total = oral_count + literate_count
    total_weighted = oral_weighted + literate_weighted
    sentence_ratio = oral_weighted / total_weighted if total_weighted > 0 else 0.5
    ensemble_score = (DOC_MODEL_WEIGHT * doc_score) + (SENTENCE_WEIGHT * sentence_ratio)

    # Bucket the score into a display mode with a matching color.
    if ensemble_score >= 0.65:
        mode = "ORAL"
        mode_desc = "High oral characteristics - repetition, direct address, concrete imagery"
        color = "#228B22"
    elif ensemble_score >= 0.35:
        mode = "MIXED"
        mode_desc = "Mixed oral and literate characteristics"
        color = "#B8860B"
    else:
        mode = "LITERATE"
        mode_desc = "High literate characteristics - abstraction, subordination, hedging"
        color = "#4169E1"

    # Text score bar: 30 cells, filled proportionally (block/shade glyphs).
    bar_len = int(ensemble_score * 30)
    bar = "\u2588" * bar_len + "\u2591" * (30 - bar_len)

    warnings_html = ""
    if lang_warning:
        warnings_html += f"""
        <div style="background: #f8d7da; border: 1px solid #f5c6cb; color: #721c24; padding: 12px; border-radius: 8px; margin-bottom: 15px;">
            <strong>Warning:</strong> {lang_warning}
        </div>
        """
    if truncated:
        warnings_html += f"""
        <div style="background: #fff3cd; border: 1px solid #ffc107; color: #856404; padding: 12px; border-radius: 8px; margin-bottom: 15px;">
            <strong>Note:</strong> This text has {total_sentence_count} sentences. Only the first {MAX_SENTENCES} were analyzed.
        </div>
        """

    score_html = f"""
    <div style="font-family: system-ui; padding: 20px; background: #f8f9fa; border-radius: 10px;">
        {warnings_html}
        <h2 style="color: {color}; margin-bottom: 10px;">Orality Score: {ensemble_score:.2f} ({mode})</h2>
        <p style="color: #666; margin-bottom: 15px;">{mode_desc}</p>
        <div style="font-family: monospace; font-size: 14px; margin-bottom: 15px;">
            <span style="color: #4169E1;">Literate</span> [{bar}] <span style="color: #228B22;">Oral</span>
        </div>
        <div style="font-size: 13px; color: #555;">
            <strong>Score Components:</strong><br>
            - Sentence analysis: {oral_count}/{total} oral ({sentence_ratio:.0%}) [confidence-weighted]<br>
            - Document model: {doc_score:.2f}<br>
            - Ensemble weights: {int(SENTENCE_WEIGHT*100)}% sentence + {int(DOC_MODEL_WEIGHT*100)}% document
        </div>
    </div>
    """

    # Sentence-level table (first 20 rows only).
    if sentence_results:
        rows = ""
        for r in sentence_results[:20]:
            cat_color = "#228B22" if r['category'] == 'oral' else "#4169E1"
            rows += f"""
            <tr>
                <td style="color: {cat_color}; font-weight: bold; padding: 8px;">{r['category'].upper()}</td>
                <td style="padding: 8px;">{r['subtype']} ({r['confidence']:.0%})</td>
                <td style="padding: 8px; font-style: italic;">"{r['sentence']}"</td>
            </tr>
            """

        sentences_html = f"""
        <div style="font-family: system-ui; padding: 20px; background: #f8f9fa; border-radius: 10px; margin-top: 20px;">
            <h3 style="margin-bottom: 15px;">Sentence-Level Analysis</h3>
            <table style="width: 100%; border-collapse: collapse; font-size: 13px;">
                <tr style="background: #e9ecef;">
                    <th style="padding: 8px; text-align: left; width: 80px;">Category</th>
                    <th style="padding: 8px; text-align: left; width: 150px;">Marker Type</th>
                    <th style="padding: 8px; text-align: left;">Sentence</th>
                </tr>
                {rows}
            </table>
            {f'<p style="color: #666; margin-top: 10px; font-size: 12px;">...and {len(sentence_results) - 20} more sentences analyzed</p>' if len(sentence_results) > 20 else ''}
        </div>
        """
    else:
        sentences_html = ""

    # Static genre reference card.
    reference_html = """
    <div style="font-family: system-ui; padding: 20px; background: #fff3cd; border-radius: 10px; margin-top: 20px;">
        <h4 style="margin-bottom: 10px;">Reference: Orality Scores by Genre</h4>
        <table style="font-size: 12px; width: 100%;">
            <tr><td><strong>0.9+</strong></td><td>Epic poetry, hip-hop, spoken word</td></tr>
            <tr><td><strong>0.7-0.9</strong></td><td>Speeches, sermons, podcasts</td></tr>
            <tr><td><strong>0.4-0.7</strong></td><td>Essays, blogs, casual writing</td></tr>
            <tr><td><strong>0.1-0.4</strong></td><td>Journalism, technical writing</td></tr>
            <tr><td><strong><0.1</strong></td><td>Academic papers, legal documents, philosophy</td></tr>
        </table>
    </div>
    """

    return score_html, sentences_html, reference_html
734
+
735
+
736
# Example texts spanning the oral-literate spectrum (epic, speech,
# academic, conversational).
examples = [
    ["Tell me, O Muse, of that ingenious hero who travelled far and wide after he had sacked the famous town of Troy. Many cities did he visit, and many were the nations with whose manners and customs he was acquainted."],
    ["We will fight on the beaches, we will fight on the landing grounds, we will fight in the fields and in the streets, we will fight in the hills; we will never surrender."],
    ["The analysis of variance revealed a statistically significant effect of treatment condition on participant response latency, F(2, 147) = 4.23, p < .05, suggesting that the experimental manipulation influenced cognitive processing speed."],
    ["So like, I was just thinking about this the other day, right? And it's crazy because we never really talk about how much social media has changed everything. You know what I mean?"],
]

# Build interface
with gr.Blocks(title="Havelock.AI - Orality Analyzer (Substring)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Havelock.AI - Orality Analyzer (Substring Staging)

    Analyze text for **oral vs literate characteristics** based on Walter Ong's linguistic framework.

    **v1.7**: Token-level span predictions for Tier 1 markers (F1 >= 0.50).
    Use the API endpoint for span data — the Gradio UI shows sentence-level results only.
    """)

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Enter text to analyze",
                placeholder="Paste your text here...",
                lines=8
            )
            analyze_btn = gr.Button("Analyze", variant="primary")
            gr.Examples(examples, inputs=text_input, label="Try these examples")

        with gr.Column(scale=1):
            score_output = gr.HTML(label="Orality Score")

    sentences_output = gr.HTML(label="Sentence Analysis")
    reference_output = gr.HTML(label="Reference")

    analyze_btn.click(
        fn=analyze_text,
        inputs=text_input,
        outputs=[score_output, sentences_output, reference_output]
    )

    # Hidden API interface for website integration (exposed as api_name
    # "analyze"; returns the analyze_api() JSON payload).
    api_input = gr.Textbox(visible=False)
    api_output = gr.JSON(visible=False)
    api_btn = gr.Button(visible=False)
    api_btn.click(fn=analyze_api, inputs=api_input, outputs=api_output, api_name="analyze")

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ huggingface_hub
4
+ langdetect
5
+ safetensors