File size: 18,707 Bytes
83ba93c
061c1e7
83ba93c
061c1e7
 
83ba93c
 
218e2b1
83ba93c
29f2de2
b5ddd7b
 
f1ab267
 
6d135aa
 
fd09961
6d135aa
fd09961
218e2b1
83ba93c
218e2b1
6d135aa
 
a484623
6d135aa
 
a484623
18e0afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83ba93c
218e2b1
6d135aa
 
b5ddd7b
18e0afe
 
 
 
218e2b1
c8c20f1
f1ab267
 
 
 
 
 
 
 
 
 
 
 
83ba93c
061c1e7
83ba93c
 
061c1e7
 
 
 
 
83ba93c
 
061c1e7
218e2b1
061c1e7
 
218e2b1
b5ddd7b
cd865e2
061c1e7
 
218e2b1
 
 
 
 
0fe274c
 
 
 
 
 
218e2b1
 
 
 
 
 
061c1e7
f1cb860
 
 
 
f1ab267
f1cb860
 
f1ab267
f1cb860
218e2b1
f1cb860
061c1e7
0fe274c
 
 
 
218e2b1
0fe274c
 
 
 
 
061c1e7
0fe274c
218e2b1
0fe274c
218e2b1
0fe274c
 
 
218e2b1
f1cb860
218e2b1
b5ddd7b
218e2b1
0fe274c
fd09961
218e2b1
fd09961
218e2b1
45089eb
061c1e7
 
45089eb
076bc18
061c1e7
 
 
 
 
 
 
218e2b1
 
061c1e7
 
218e2b1
061c1e7
218e2b1
 
 
061c1e7
 
218e2b1
 
 
061c1e7
 
 
076bc18
 
061c1e7
 
 
 
 
 
218e2b1
 
061c1e7
 
218e2b1
061c1e7
218e2b1
 
 
061c1e7
 
218e2b1
 
 
061c1e7
 
 
b5ddd7b
40ccedf
218e2b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83ba93c
8860b97
061c1e7
 
 
 
0fe274c
 
061c1e7
 
13f7ac3
8ae954c
40ccedf
0fe274c
 
076bc18
0fe274c
218e2b1
d57b866
 
f1ab267
d57b866
0fe274c
 
d57b866
 
f1ab267
4bda87e
061c1e7
0fe274c
 
 
 
 
8ae954c
061c1e7
13f7ac3
061c1e7
 
 
13f7ac3
061c1e7
 
 
 
 
 
13f7ac3
061c1e7
 
 
 
4bda87e
061c1e7
 
 
 
 
 
 
 
 
4bda87e
061c1e7
 
 
 
 
 
13f7ac3
061c1e7
 
 
 
fc64ea0
061c1e7
 
 
 
 
 
218e2b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1ab267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18e0afe
 
 
 
 
 
 
 
 
 
f1ab267
 
 
18e0afe
f1ab267
18e0afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1ab267
 
 
061c1e7
13f7ac3
 
061c1e7
ebf2964
061c1e7
fc64ea0
061c1e7
 
f1ab267
061c1e7
 
fc64ea0
061c1e7
 
 
 
 
 
 
fc64ea0
13f7ac3
fc64ea0
8f5fea2
 
 
 
 
 
f1ab267
 
fc64ea0
f1ab267
 
 
218e2b1
f1ab267
218e2b1
8f5fea2
f1ab267
8f5fea2
218e2b1
13f7ac3
 
061c1e7
 
218e2b1
13f7ac3
 
061c1e7
13f7ac3
 
8860b97
83ba93c
 
 
8860b97
8ae954c
6f4d4de
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
"""
LexiMind - Book & Paper Discovery

Browse books and research papers by topic or emotion.
Pre-analyzed summaries help you find what to read next.

Author: Oliver Perrin
Date: 2026-01-14
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

import gradio as gr
from datasets import Dataset, load_dataset

# --------------- Load Dataset from HuggingFace Hub ---------------

# NOTE: this runs at import time and requires network access to the HF Hub.
print("Loading discovery dataset from HuggingFace Hub...")
_dataset: Dataset = load_dataset("OliverPerrin/LexiMind-Discovery", split="train")  # type: ignore[assignment]
print(f"Loaded {len(_dataset)} items")

# Convert to list of dicts for easier filtering
ALL_ITEMS: list[dict[str, Any]] = [dict(row) for row in _dataset]

# Extract unique topics and emotions FROM THE DATASET (what model predicted)
DATASET_TOPICS: list[str] = sorted(set(str(item["topic"]) for item in ALL_ITEMS if item.get("topic")))
DATASET_EMOTIONS: list[str] = sorted(set(str(item["emotion"]) for item in ALL_ITEMS if item.get("emotion")))

# Load ALL possible labels from labels.json (what the model CAN predict)
# Resolved relative to this file; missing file falls back to dataset-observed labels.
_labels_path = Path(__file__).parent.parent / "artifacts" / "labels.json"
if _labels_path.exists():
    with open(_labels_path) as f:
        _labels = json.load(f)
    ALL_TOPICS: list[str] = _labels.get("topic", DATASET_TOPICS)
    ALL_EMOTIONS: list[str] = _labels.get("emotion", DATASET_EMOTIONS)
else:
    ALL_TOPICS = DATASET_TOPICS
    ALL_EMOTIONS = DATASET_EMOTIONS

# Use dataset-observed values for dropdown filtering
# (dropdowns only offer labels that actually occur, so no empty result pages)
TOPICS = DATASET_TOPICS
EMOTIONS = DATASET_EMOTIONS

# Group by source type
BOOKS: list[dict[str, Any]] = [item for item in ALL_ITEMS if item.get("source_type") == "literary"]
PAPERS: list[dict[str, Any]] = [item for item in ALL_ITEMS if item.get("source_type") == "academic"]

print(f"Dataset Topics ({len(TOPICS)}): {TOPICS}")
print(f"Dataset Emotions ({len(EMOTIONS)}): {EMOTIONS}")
print(f"All Model Topics ({len(ALL_TOPICS)}): {ALL_TOPICS}")
print(f"All Model Emotions ({len(ALL_EMOTIONS)}): {ALL_EMOTIONS}")
print(f"Books: {len(BOOKS)}, Papers: {len(PAPERS)}")

# --------------- Load Evaluation Metrics ---------------

# METRICS stays empty if the report is absent or unreadable; the UI's metrics
# tab checks for each section and shows a placeholder when missing.
METRICS: dict[str, Any] = {}
_metrics_path = Path(__file__).parent.parent / "outputs" / "evaluation_report.json"
if _metrics_path.exists():
    try:
        with open(_metrics_path) as f:
            METRICS = json.load(f)
        print(f"Loaded evaluation metrics from {_metrics_path}")
    except Exception as e:
        print(f"Warning: Could not load metrics: {e}")


# --------------- Filter Functions ---------------


def get_items_by_topic(topic: str) -> list[dict]:
    """Return the catalog entries predicted to belong to *topic*.

    The sentinel value "All" disables filtering and returns the full catalog.
    """
    if topic == "All":
        return ALL_ITEMS
    return [entry for entry in ALL_ITEMS if entry.get("topic") == topic]


def get_items_by_emotion(emotion: str) -> list[dict]:
    """Return the catalog entries predicted to carry *emotion*.

    The sentinel value "All" disables filtering and returns the full catalog.
    """
    if emotion == "All":
        return ALL_ITEMS
    return [entry for entry in ALL_ITEMS if entry.get("emotion") == emotion]


def format_item_card(item: dict) -> str:
    """Render one catalog entry as a self-contained markdown card.

    The card shows the title, source badge, topic/emotion predictions with
    traffic-light confidence markers, the chosen summary, and a collapsible
    preview of the original text.
    """
    title = item.get("title", "Unknown")
    source_type = item.get("source_type", "unknown")
    dataset_name = item.get("dataset", "").title()

    # Academic items get the paper icon; everything else reads as literature.
    is_academic = source_type == "academic"
    icon = "πŸ“„" if is_academic else "πŸ“–"
    type_label = "Research Paper" if is_academic else "Literature"

    topic = item.get("topic", "Unknown")
    topic_conf = item.get("topic_confidence", 0)
    emotion = item.get("emotion", "Unknown")
    emotion_conf = item.get("emotion_confidence", 0)

    # Literary works (and anything flagged use_reference_summary) show the
    # human-written description; otherwise fall back to the model's output.
    if item.get("use_reference_summary", False) or source_type == "literary":
        summary = item.get("reference_summary", "")
        summary_label = "πŸ“š **Book Description** (Goodreads-style):"
    else:
        summary = item.get("generated_summary", "")
        summary_label = "πŸ€– **AI-Generated Description:**"

    summary = summary or "No summary available."

    # Trim long summaries on a word boundary so the card stays compact.
    if len(summary) > 400:
        summary = summary[:400].rsplit(' ', 1)[0] + "..."

    # Preview of original text, truncated to 400 characters.
    body_text = item.get("text", "")
    text_preview = body_text[:400] + "..." if len(body_text) > 400 else body_text

    def _badge(conf: float) -> str:
        # Traffic-light marker for prediction confidence.
        return "🟒" if conf > 0.6 else "🟑" if conf > 0.3 else "πŸ”΄"

    topic_badge = _badge(topic_conf)
    emotion_badge = _badge(emotion_conf)

    return f"""### {icon} **{title}**

<small>*{type_label}* from {dataset_name}</small>

| Topic | Emotion |
|-------|---------|
| {topic_badge} {topic} ({topic_conf:.0%}) | {emotion_badge} {emotion.title()} ({emotion_conf:.0%}) |

{summary_label}
> {summary}

<details>
<summary>πŸ“œ View Original Text</summary>

{text_preview}

</details>

---
"""


def browse_by_topic(topic: str) -> str:
    """Build the markdown results page for one topic (or "All").

    Items are split into literary and academic sections, each capped at
    25 cards so the rendered page stays a reasonable size.
    """
    items = get_items_by_topic(topic)
    if not items:
        return "No items found for this topic."

    literary = [entry for entry in items if entry.get("source_type") == "literary"]
    academic = [entry for entry in items if entry.get("source_type") == "academic"]

    heading = "All Topics" if topic == "All" else topic
    parts = [
        f"## {heading}\n\n",
        f"*Found {len(items)} items ({len(literary)} literary, {len(academic)} academic)*\n\n",
    ]

    if literary:
        parts.append("### πŸ“– Literary Works\n\n")
        parts.extend(format_item_card(entry) for entry in literary[:25])

    if academic:
        parts.append("### πŸ“„ Academic Papers\n\n")
        parts.extend(format_item_card(entry) for entry in academic[:25])

    return "".join(parts)


def browse_by_emotion(emotion: str) -> str:
    """Build the markdown results page for one emotion (or "All").

    Mirrors browse_by_topic: literary and academic sections, each capped
    at 25 cards.
    """
    items = get_items_by_emotion(emotion)
    if not items:
        return "No items found for this emotion."

    literary = [entry for entry in items if entry.get("source_type") == "literary"]
    academic = [entry for entry in items if entry.get("source_type") == "academic"]

    heading = "All Emotions" if emotion == "All" else emotion.title()
    parts = [
        f"## Feeling {heading}?\n\n",
        f"*Found {len(items)} items ({len(literary)} literary, {len(academic)} academic)*\n\n",
    ]

    if literary:
        parts.append("### πŸ“– Literary Works\n\n")
        parts.extend(format_item_card(entry) for entry in literary[:25])

    if academic:
        parts.append("### πŸ“„ Academic Papers\n\n")
        parts.extend(format_item_card(entry) for entry in academic[:25])

    return "".join(parts)


def search_items(query: str) -> str:
    """Case-insensitive keyword search over titles, texts, and summaries.

    Requires at least 3 non-whitespace characters; results are capped at
    30 cards. Searches both summary fields: literary items display the
    reference summary (see format_item_card), so scanning only the
    generated summary would miss every literary description.
    """
    # Strip first so whitespace cannot satisfy the length minimum.
    query = (query or "").strip()
    if len(query) < 3:
        return "Enter at least 3 characters to search."

    needle = query.lower()
    fields = ("text", "generated_summary", "reference_summary", "title")
    # `or ""` guards against fields that exist but hold None.
    matches = [
        item for item in ALL_ITEMS
        if any(needle in str(item.get(field) or "").lower() for field in fields)
    ]

    if not matches:
        return f"No results found for '{query}'."

    result = f"## Search Results for '{query}'\n\n"
    result += f"*Found {len(matches)} matching items*\n\n"

    for item in matches[:30]:
        result += format_item_card(item)

    return result


# --------------- Gradio Interface ---------------

# Declarative UI: five tabs (topic browse, emotion browse, search, metrics,
# about) built once at import time and exposed as `demo`.
with gr.Blocks(
    title="LexiMind",
    theme=gr.themes.Soft(),
    css="""
    .result-box { max-height: 700px; overflow-y: auto; }
    h3 { margin-top: 0.5em !important; }
    """
) as demo:

    # Header panel; counts are injected with str.format after the literal.
    gr.Markdown(
        """
        # πŸ“š LexiMind - Literary Discovery
        ### Find Books & Research Papers by Topic or Emotional Tone
        
        Explore **{total_count}** items analyzed by the LexiMind multi-task transformer:
        
        | Source | Count | Description |
        |--------|-------|-------------|
        | πŸ“– Literature | {lit_count} | Classic books with Goodreads-style descriptions |
        | πŸ“„ Research | {paper_count} | Scientific papers from arXiv |
        
        **Model Capabilities:**
        - 🏷️ **Topic Classification**: Fiction, Science, History, Philosophy, Arts, Business, Technology
        - πŸ’­ **Emotion Detection**: 28 emotions (joy, sadness, anger, fear, surprise, love, etc.)
        - πŸ“ **Book Descriptions**: Back-cover style summaries of what texts are about
        
        ---
        """.format(
            total_count=len(ALL_ITEMS),
            lit_count=len(BOOKS),
            paper_count=len(PAPERS)
        )
    )
    
    with gr.Tabs():
        # ===================== TAB 1: BROWSE BY TOPIC =====================
        with gr.Tab("🏷️ Browse by Topic"):
            gr.Markdown("*Select a topic to explore related books and papers*")
            
            # Only dataset-observed topics are offered, plus the "All" sentinel.
            topic_dropdown = gr.Dropdown(
                choices=["All"] + TOPICS,
                value="All",
                label="Select Topic",
                interactive=True,
            )
            
            # Pre-render the unfiltered view so the tab is populated on load.
            topic_results = gr.Markdown(
                value=browse_by_topic("All"),
                elem_classes=["result-box"],
            )
            
            topic_dropdown.change(
                fn=browse_by_topic,
                inputs=[topic_dropdown],
                outputs=[topic_results],
            )
        
        # ===================== TAB 2: BROWSE BY EMOTION =====================
        with gr.Tab("πŸ’­ Browse by Emotion"):
            gr.Markdown("*Find books and papers that evoke specific emotions*")
            
            # Choices are Title-cased for display; dataset values are lowercase.
            emotion_dropdown = gr.Dropdown(
                choices=["All"] + [e.title() for e in EMOTIONS],
                value="All",
                label="Select Emotion",
                interactive=True,
            )
            
            emotion_results = gr.Markdown(
                value=browse_by_emotion("All"),
                elem_classes=["result-box"],
            )
            
            # Lower-case the displayed label back to the dataset's form
            # before filtering ("All" passes through unchanged).
            emotion_dropdown.change(
                fn=lambda e: browse_by_emotion(e.lower() if e != "All" else "All"),
                inputs=[emotion_dropdown],
                outputs=[emotion_results],
            )
        
        # ===================== TAB 3: SEARCH =====================
        with gr.Tab("πŸ” Search"):
            gr.Markdown("*Search through all books and papers by keyword*")
            
            search_input = gr.Textbox(
                placeholder="Enter keywords to search...",
                label="Search",
                interactive=True,
            )
            
            search_results = gr.Markdown(
                value="Enter at least 3 characters to search.",
                elem_classes=["result-box"],
            )
            
            # Live search: fires on every keystroke via .change.
            search_input.change(
                fn=search_items,
                inputs=[search_input],
                outputs=[search_results],
            )
        
        # ===================== TAB 4: METRICS =====================
        with gr.Tab("πŸ“Š Model Metrics"):
            gr.Markdown(
                """
                ### Evaluation Metrics
                
                LexiMind is evaluated using comprehensive metrics across all three tasks.
                Metrics are computed on held-out validation data.
                """
            )
            
            # Summarization Metrics
            gr.Markdown("#### πŸ“ Summarization Metrics")
            
            # Each section degrades to a placeholder when METRICS lacks its key.
            if METRICS.get("summarization"):
                summ = METRICS["summarization"]
                # .get chains tolerate both report key spellings (e.g.
                # "rouge_rouge1" vs "rouge1"), defaulting to 0.
                summ_md = """
| Metric | Score | Description |
|--------|-------|-------------|
| **ROUGE-1** | {rouge1:.4f} | Unigram overlap with reference |
| **ROUGE-2** | {rouge2:.4f} | Bigram overlap with reference |
| **ROUGE-L** | {rougeL:.4f} | Longest common subsequence |
| **BLEU-4** | {bleu4:.4f} | 4-gram precision score |
| **BERTScore F1** | {bertscore:.4f} | Semantic similarity (contextual) |

*Note: For back-cover style descriptions, BERTScore is more meaningful than ROUGE 
since descriptions paraphrase rather than quote the source text.*
""".format(
                    rouge1=summ.get("rouge_rouge1", summ.get("rouge1", 0)),
                    rouge2=summ.get("rouge_rouge2", summ.get("rouge2", 0)),
                    rougeL=summ.get("rouge_rougeL", summ.get("rougeL", 0)),
                    bleu4=summ.get("bleu4", 0),
                    bertscore=summ.get("bertscore_f1", 0),
                )
                gr.Markdown(summ_md)
            else:
                gr.Markdown("*Summarization metrics not available. Run evaluation script.*")
            
            # Topic Classification Metrics
            gr.Markdown("#### 🏷️ Topic Classification Metrics")
            
            if METRICS.get("topic"):
                topic = METRICS["topic"]
                topic_md = """
| Metric | Score |
|--------|-------|
| **Accuracy** | {accuracy:.2%} |
| **Macro F1** | {f1:.4f} |
| **Precision** | {precision:.4f} |
| **Recall** | {recall:.4f} |
""".format(
                    accuracy=topic.get("accuracy", 0),
                    f1=topic.get("f1", topic.get("macro_f1", 0)),
                    precision=topic.get("precision", 0),
                    recall=topic.get("recall", 0),
                )
                gr.Markdown(topic_md)
            else:
                gr.Markdown("*Topic classification metrics not available.*")
            
            # Emotion Detection Metrics
            gr.Markdown("#### πŸ’­ Emotion Detection Metrics")
            
            if METRICS.get("emotion"):
                emotion = METRICS["emotion"]
                emotion_md = """
| Metric | Score |
|--------|-------|
| **Multi-label F1** | {f1:.4f} |
| **Precision** | {precision:.4f} |
| **Recall** | {recall:.4f} |

*Emotion detection uses 28 labels from GoEmotions. Multiple emotions can be assigned to each text.*
""".format(
                    f1=emotion.get("f1", emotion.get("multilabel_f1", 0)),
                    precision=emotion.get("precision", 0),
                    recall=emotion.get("recall", 0),
                )
                gr.Markdown(emotion_md)
            else:
                gr.Markdown("*Emotion detection metrics not available.*")
            
            # Dataset Statistics
            gr.Markdown("#### πŸ“ˆ Dataset & Model Statistics")
            
            # Build topic list with indicators for observed vs possible
            # (bold = label actually appears in the discovery dataset).
            topic_list = ", ".join([
                f"**{t}**" if t in TOPICS else t for t in ALL_TOPICS
            ])
            emotion_list = ", ".join([
                f"**{e}**" if e in EMOTIONS else e for e in ALL_EMOTIONS
            ])
            
            gr.Markdown(f"""
| Statistic | Value |
|-----------|-------|
| Total Discovery Items | {len(ALL_ITEMS)} |
| Literary Works | {len(BOOKS)} |
| Academic Papers (arXiv) | {len(PAPERS)} |
| Topics in Dataset | {len(TOPICS)} of {len(ALL_TOPICS)} possible |
| Emotions in Dataset | {len(EMOTIONS)} of {len(ALL_EMOTIONS)} possible |

**All Model Topics ({len(ALL_TOPICS)}):** {topic_list}

**All Model Emotions ({len(ALL_EMOTIONS)}):** {emotion_list}

*Bold items appear in the discovery dataset. The model can predict all listed labels.*

---

**Note on Content Types:**
- πŸ“„ **Academic Papers** include CS/AI papers (Technology), Physics/Math (Science), Economics (Business)
- πŸ“– **Literary Works** include novels (Fiction), biographies (History), philosophical texts (Philosophy)
- Technical blogs and tutorials would be classified under **Technology**
""")
        
        # ===================== TAB 5: ABOUT =====================
        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                """
                ### About LexiMind
                
                LexiMind is a **272M parameter encoder-decoder transformer** trained on three tasks:
                
                | Task | Description |
                |------|-------------|
                | **Book Descriptions** | Generate back-cover style descriptions of what books are about |
                | **Topic Classification** | Categorize into Fiction, Science, Technology, Philosophy, History, Business, Arts |
                | **Emotion Detection** | Identify emotional tones (28 emotions from GoEmotions) |
                
                ### Architecture
                
                - **Base:** FLAN-T5-base (Google)
                - **Encoder:** 12 layers, 768 dim, 12 attention heads
                - **Decoder:** 12 layers with causal attention
                - **Position:** T5 relative position bias
                - **Training:** Multi-task learning with task-specific heads
                
                ### Training Data
                
                | Dataset | Task | Samples |
                |---------|------|---------|
                | Gutenberg + Goodreads | Book Descriptions | ~4K literary pairs |
                | arXiv (body β†’ abstract) | Paper Abstracts | ~45K academic pairs |
                | 20 Newsgroups + Gutenberg + arXiv | Topic Classification | 3.4K (7 classes) |
                | GoEmotions (Reddit) | Emotion Detection | 43K (28 labels) |
                
                ### Key Design Decision
                
                LexiMind generates **back-cover style descriptions** (what a book is about) rather than 
                plot summaries (what happens in the book). This is achieved by training on Goodreads 
                descriptions paired with Project Gutenberg book texts.
                
                ### Evaluation Metrics
                
                - **ROUGE-1/2/L**: Lexical overlap with reference summaries
                - **BLEU-4**: N-gram precision
                - **BERTScore**: Semantic similarity using contextual embeddings (primary metric for abstractive summarization)
                
                ### Links
                
                - πŸ”— [GitHub](https://github.com/OliverPerrin/LexiMind)
                - πŸ€— [Model](https://huggingface.co/OliverPerrin/LexiMind-Model)
                - πŸ“Š [Discovery Dataset](https://huggingface.co/datasets/OliverPerrin/LexiMind-Discovery)
                
                ---
                *Built by Oliver Perrin β€’ Appalachian State University β€’ 2025-2026*
                """
            )


# --------------- Entry Point ---------------

if __name__ == "__main__":
    # Bind to all interfaces on the standard Gradio port so the app is
    # reachable when run inside a container (e.g. a HuggingFace Space).
    demo.launch(server_name="0.0.0.0", server_port=7860)