""" LexiMind - Book & Paper Discovery Browse books and research papers by topic or emotion. Pre-analyzed summaries help you find what to read next. Author: Oliver Perrin Date: 2026-01-14 """ from __future__ import annotations import json from pathlib import Path from typing import Any import gradio as gr from datasets import Dataset, load_dataset # --------------- Load Dataset from HuggingFace Hub --------------- print("Loading discovery dataset from HuggingFace Hub...") _dataset: Dataset = load_dataset("OliverPerrin/LexiMind-Discovery", split="train") # type: ignore[assignment] print(f"Loaded {len(_dataset)} items") # Convert to list of dicts for easier filtering ALL_ITEMS: list[dict[str, Any]] = [dict(row) for row in _dataset] # Extract unique topics and emotions FROM THE DATASET (what model predicted) DATASET_TOPICS: list[str] = sorted(set(str(item["topic"]) for item in ALL_ITEMS if item.get("topic"))) DATASET_EMOTIONS: list[str] = sorted(set(str(item["emotion"]) for item in ALL_ITEMS if item.get("emotion"))) # Load ALL possible labels from labels.json (what the model CAN predict) _labels_path = Path(__file__).parent.parent / "artifacts" / "labels.json" if _labels_path.exists(): with open(_labels_path) as f: _labels = json.load(f) ALL_TOPICS: list[str] = _labels.get("topic", DATASET_TOPICS) ALL_EMOTIONS: list[str] = _labels.get("emotion", DATASET_EMOTIONS) else: ALL_TOPICS = DATASET_TOPICS ALL_EMOTIONS = DATASET_EMOTIONS # Use dataset-observed values for dropdown filtering TOPICS = DATASET_TOPICS EMOTIONS = DATASET_EMOTIONS # Group by source type BOOKS: list[dict[str, Any]] = [item for item in ALL_ITEMS if item.get("source_type") == "literary"] PAPERS: list[dict[str, Any]] = [item for item in ALL_ITEMS if item.get("source_type") == "academic"] print(f"Dataset Topics ({len(TOPICS)}): {TOPICS}") print(f"Dataset Emotions ({len(EMOTIONS)}): {EMOTIONS}") print(f"All Model Topics ({len(ALL_TOPICS)}): {ALL_TOPICS}") print(f"All Model Emotions ({len(ALL_EMOTIONS)}): {ALL_EMOTIONS}") print(f"Books: {len(BOOKS)}, Papers: {len(PAPERS)}") # --------------- Load Evaluation Metrics --------------- METRICS: dict[str, Any] = {} _metrics_path = Path(__file__).parent.parent / "outputs" / "evaluation_report.json" if _metrics_path.exists(): try: with open(_metrics_path) as f: METRICS = json.load(f) print(f"Loaded evaluation metrics from {_metrics_path}") except Exception as e: print(f"Warning: Could not load metrics: {e}") # --------------- Filter Functions --------------- def get_items_by_topic(topic: str) -> list[dict]: """Get all items matching a topic.""" if topic == "All": return ALL_ITEMS return [item for item in ALL_ITEMS if item.get("topic") == topic] def get_items_by_emotion(emotion: str) -> list[dict]: """Get all items matching an emotion.""" if emotion == "All": return ALL_ITEMS return [item for item in ALL_ITEMS if item.get("emotion") == emotion] def format_item_card(item: dict) -> str: """Format an item as a markdown card.""" title = item.get("title", "Unknown") source_type = item.get("source_type", "unknown") dataset_name = item.get("dataset", "").title() # Icon based on type if source_type == "academic": icon = "📄" type_label = "Research Paper" else: icon = "📖" type_label = "Literature" # Topic and emotion with confidence topic = item.get("topic", "Unknown") topic_conf = item.get("topic_confidence", 0) emotion = item.get("emotion", "Unknown") emotion_conf = item.get("emotion_confidence", 0) # Summary - check if using reference or generated use_reference = item.get("use_reference_summary", False) if use_reference or source_type == "literary": summary = item.get("reference_summary", "") summary_label = "📚 **Book Description** (Goodreads-style):" else: summary = item.get("generated_summary", "") summary_label = "🤖 **AI-Generated Description:**" if not summary: summary = "No summary available." # Truncate summary if too long if len(summary) > 400: summary = summary[:400].rsplit(' ', 1)[0] + "..." # Preview of original text text_preview = item.get("text", "")[:400] + "..." if len(item.get("text", "")) > 400 else item.get("text", "") # Confidence badges topic_badge = "đŸŸĸ" if topic_conf > 0.6 else "🟡" if topic_conf > 0.3 else "🔴" emotion_badge = "đŸŸĸ" if emotion_conf > 0.6 else "🟡" if emotion_conf > 0.3 else "🔴" return f"""### {icon} **{title}** *{type_label}* from {dataset_name} | Topic | Emotion | |-------|---------| | {topic_badge} {topic} ({topic_conf:.0%}) | {emotion_badge} {emotion.title()} ({emotion_conf:.0%}) | {summary_label} > {summary}
📜 View Original Text {text_preview}
--- """ def browse_by_topic(topic: str) -> str: """Browse items filtered by topic.""" items = get_items_by_topic(topic) if not items: return "No items found for this topic." # Group by type literary = [i for i in items if i.get("source_type") == "literary"] academic = [i for i in items if i.get("source_type") == "academic"] result = f"## {topic if topic != 'All' else 'All Topics'}\n\n" result += f"*Found {len(items)} items ({len(literary)} literary, {len(academic)} academic)*\n\n" if literary: result += "### 📖 Literary Works\n\n" for item in literary[:25]: # Limit to avoid huge pages result += format_item_card(item) if academic: result += "### 📄 Academic Papers\n\n" for item in academic[:25]: result += format_item_card(item) return result def browse_by_emotion(emotion: str) -> str: """Browse items filtered by emotion.""" items = get_items_by_emotion(emotion) if not items: return "No items found for this emotion." literary = [i for i in items if i.get("source_type") == "literary"] academic = [i for i in items if i.get("source_type") == "academic"] result = f"## Feeling {emotion.title() if emotion != 'All' else 'All Emotions'}?\n\n" result += f"*Found {len(items)} items ({len(literary)} literary, {len(academic)} academic)*\n\n" if literary: result += "### 📖 Literary Works\n\n" for item in literary[:25]: result += format_item_card(item) if academic: result += "### 📄 Academic Papers\n\n" for item in academic[:25]: result += format_item_card(item) return result def search_items(query: str) -> str: """Search items by text content.""" if not query or len(query) < 3: return "Enter at least 3 characters to search." query_lower = query.lower() matches = [ item for item in ALL_ITEMS if query_lower in item.get("text", "").lower() or query_lower in item.get("generated_summary", "").lower() or query_lower in item.get("title", "").lower() ] if not matches: return f"No results found for '{query}'." result = f"## Search Results for '{query}'\n\n" result += f"*Found {len(matches)} matching items*\n\n" for item in matches[:30]: result += format_item_card(item) return result # --------------- Gradio Interface --------------- with gr.Blocks( title="LexiMind", theme=gr.themes.Soft(), css=""" .result-box { max-height: 700px; overflow-y: auto; } h3 { margin-top: 0.5em !important; } """ ) as demo: gr.Markdown( """ # 📚 LexiMind - Literary Discovery ### Find Books & Research Papers by Topic or Emotional Tone Explore **{total_count}** items analyzed by the LexiMind multi-task transformer: | Source | Count | Description | |--------|-------|-------------| | 📖 Literature | {lit_count} | Classic books with Goodreads-style descriptions | | 📄 Research | {paper_count} | Scientific papers from arXiv | **Model Capabilities:** - đŸˇī¸ **Topic Classification**: Fiction, Science, History, Philosophy, Arts, Business, Technology - 💭 **Emotion Detection**: 28 emotions (joy, sadness, anger, fear, surprise, love, etc.) - 📝 **Book Descriptions**: Back-cover style summaries of what texts are about --- """.format( total_count=len(ALL_ITEMS), lit_count=len(BOOKS), paper_count=len(PAPERS) ) ) with gr.Tabs(): # ===================== TAB 1: BROWSE BY TOPIC ===================== with gr.Tab("đŸˇī¸ Browse by Topic"): gr.Markdown("*Select a topic to explore related books and papers*") topic_dropdown = gr.Dropdown( choices=["All"] + TOPICS, value="All", label="Select Topic", interactive=True, ) topic_results = gr.Markdown( value=browse_by_topic("All"), elem_classes=["result-box"], ) topic_dropdown.change( fn=browse_by_topic, inputs=[topic_dropdown], outputs=[topic_results], ) # ===================== TAB 2: BROWSE BY EMOTION ===================== with gr.Tab("💭 Browse by Emotion"): gr.Markdown("*Find books and papers that evoke specific emotions*") emotion_dropdown = gr.Dropdown( choices=["All"] + [e.title() for e in EMOTIONS], value="All", label="Select Emotion", interactive=True, ) emotion_results = gr.Markdown( value=browse_by_emotion("All"), elem_classes=["result-box"], ) emotion_dropdown.change( fn=lambda e: browse_by_emotion(e.lower() if e != "All" else "All"), inputs=[emotion_dropdown], outputs=[emotion_results], ) # ===================== TAB 3: SEARCH ===================== with gr.Tab("🔍 Search"): gr.Markdown("*Search through all books and papers by keyword*") search_input = gr.Textbox( placeholder="Enter keywords to search...", label="Search", interactive=True, ) search_results = gr.Markdown( value="Enter at least 3 characters to search.", elem_classes=["result-box"], ) search_input.change( fn=search_items, inputs=[search_input], outputs=[search_results], ) # ===================== TAB 4: METRICS ===================== with gr.Tab("📊 Model Metrics"): gr.Markdown( """ ### Evaluation Metrics LexiMind is evaluated using comprehensive metrics across all three tasks. Metrics are computed on held-out validation data. """ ) # Summarization Metrics gr.Markdown("#### 📝 Summarization Metrics") if METRICS.get("summarization"): summ = METRICS["summarization"] summ_md = """ | Metric | Score | Description | |--------|-------|-------------| | **ROUGE-1** | {rouge1:.4f} | Unigram overlap with reference | | **ROUGE-2** | {rouge2:.4f} | Bigram overlap with reference | | **ROUGE-L** | {rougeL:.4f} | Longest common subsequence | | **BLEU-4** | {bleu4:.4f} | 4-gram precision score | | **BERTScore F1** | {bertscore:.4f} | Semantic similarity (contextual) | *Note: For back-cover style descriptions, BERTScore is more meaningful than ROUGE since descriptions paraphrase rather than quote the source text.* """.format( rouge1=summ.get("rouge_rouge1", summ.get("rouge1", 0)), rouge2=summ.get("rouge_rouge2", summ.get("rouge2", 0)), rougeL=summ.get("rouge_rougeL", summ.get("rougeL", 0)), bleu4=summ.get("bleu4", 0), bertscore=summ.get("bertscore_f1", 0), ) gr.Markdown(summ_md) else: gr.Markdown("*Summarization metrics not available. Run evaluation script.*") # Topic Classification Metrics gr.Markdown("#### đŸˇī¸ Topic Classification Metrics") if METRICS.get("topic"): topic = METRICS["topic"] topic_md = """ | Metric | Score | |--------|-------| | **Accuracy** | {accuracy:.2%} | | **Macro F1** | {f1:.4f} | | **Precision** | {precision:.4f} | | **Recall** | {recall:.4f} | """.format( accuracy=topic.get("accuracy", 0), f1=topic.get("f1", topic.get("macro_f1", 0)), precision=topic.get("precision", 0), recall=topic.get("recall", 0), ) gr.Markdown(topic_md) else: gr.Markdown("*Topic classification metrics not available.*") # Emotion Detection Metrics gr.Markdown("#### 💭 Emotion Detection Metrics") if METRICS.get("emotion"): emotion = METRICS["emotion"] emotion_md = """ | Metric | Score | |--------|-------| | **Multi-label F1** | {f1:.4f} | | **Precision** | {precision:.4f} | | **Recall** | {recall:.4f} | *Emotion detection uses 28 labels from GoEmotions. Multiple emotions can be assigned to each text.* """.format( f1=emotion.get("f1", emotion.get("multilabel_f1", 0)), precision=emotion.get("precision", 0), recall=emotion.get("recall", 0), ) gr.Markdown(emotion_md) else: gr.Markdown("*Emotion detection metrics not available.*") # Dataset Statistics gr.Markdown("#### 📈 Dataset & Model Statistics") # Build topic list with indicators for observed vs possible topic_list = ", ".join([ f"**{t}**" if t in TOPICS else t for t in ALL_TOPICS ]) emotion_list = ", ".join([ f"**{e}**" if e in EMOTIONS else e for e in ALL_EMOTIONS ]) gr.Markdown(f""" | Statistic | Value | |-----------|-------| | Total Discovery Items | {len(ALL_ITEMS)} | | Literary Works | {len(BOOKS)} | | Academic Papers (arXiv) | {len(PAPERS)} | | Topics in Dataset | {len(TOPICS)} of {len(ALL_TOPICS)} possible | | Emotions in Dataset | {len(EMOTIONS)} of {len(ALL_EMOTIONS)} possible | **All Model Topics ({len(ALL_TOPICS)}):** {topic_list} **All Model Emotions ({len(ALL_EMOTIONS)}):** {emotion_list} *Bold items appear in the discovery dataset. The model can predict all listed labels.* --- **Note on Content Types:** - 📄 **Academic Papers** include CS/AI papers (Technology), Physics/Math (Science), Economics (Business) - 📖 **Literary Works** include novels (Fiction), biographies (History), philosophical texts (Philosophy) - Technical blogs and tutorials would be classified under **Technology** """) # ===================== TAB 5: ABOUT ===================== with gr.Tab("â„šī¸ About"): gr.Markdown( """ ### About LexiMind LexiMind is a **272M parameter encoder-decoder transformer** trained on three tasks: | Task | Description | |------|-------------| | **Book Descriptions** | Generate back-cover style descriptions of what books are about | | **Topic Classification** | Categorize into Fiction, Science, Technology, Philosophy, History, Business, Arts | | **Emotion Detection** | Identify emotional tones (28 emotions from GoEmotions) | ### Architecture - **Base:** FLAN-T5-base (Google) - **Encoder:** 12 layers, 768 dim, 12 attention heads - **Decoder:** 12 layers with causal attention - **Position:** T5 relative position bias - **Training:** Multi-task learning with task-specific heads ### Training Data | Dataset | Task | Description | |---------|------|-------------| | Goodreads (711k+ blurbs) | Book Descriptions | Back-cover style descriptions matched with Gutenberg texts | | arXiv | Paper Abstracts | Scientific paper summarization | | 20 Newsgroups + Gutenberg | Topic Classification | Multi-domain topic categorization | | GoEmotions | Emotion Detection | 28-class multi-label emotion classification | ### Key Design Decision LexiMind generates **back-cover style descriptions** (what a book is about) rather than plot summaries (what happens in the book). This is achieved by training on Goodreads descriptions paired with Project Gutenberg book texts. ### Evaluation Metrics - **ROUGE-1/2/L**: Lexical overlap (expected range: 0.15-0.25 for descriptions) - **BLEU-4**: N-gram precision - **BERTScore**: Semantic similarity using contextual embeddings (key metric for paraphrasing) ### Links - 🔗 [GitHub](https://github.com/OliverPerrin/LexiMind) - 🤗 [Model](https://huggingface.co/OliverPerrin/LexiMind-Model) - 📊 [Discovery Dataset](https://huggingface.co/datasets/OliverPerrin/LexiMind-Discovery) --- *Built by Oliver Perrin â€ĸ Appalachian State University â€ĸ 2025-2026* """ ) # --------------- Entry Point --------------- if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)