# Spaces:
#
# Shouryahere
# /
#
# infy
#
# Running
#
# File size: 25,312 Bytes

"""
HuggingFace Enabling Sessions - Gradio Interactive Demo App
Hosted on HuggingFace Spaces
"""

import gradio as gr
import config
import utils
import pandas as pd

try:
    import spaces
except Exception:
    # `spaces` could not be imported: install a stand-in object so that the
    # `@spaces.GPU` decorators used in this file still resolve.
    class _SpacesFallback:
        """Stand-in for the `spaces` module exposing a no-op ``GPU`` decorator."""

        @staticmethod
        def GPU(func=None, *args, **kwargs):
            """Return ``func`` unchanged, or a pass-through decorator when ``func`` is None.

            Supports both the bare form (``@spaces.GPU``) and the called form
            (``@spaces.GPU(...)``); any extra arguments are ignored.
            """
            if func is not None:
                # Bare decorator usage: hand the function straight back.
                return func

            # Called with no function: return a decorator that does nothing.
            def passthrough(wrapped):
                return wrapped

            return passthrough

    spaces = _SpacesFallback()

# ===================== UTILITIES =====================

def load_sample_texts():
    """Load the sample texts CSV into a DataFrame.

    Reads the file at ``config.SAMPLE_DATA_CSV`` with ``pandas.read_csv``.

    Returns:
        The parsed ``DataFrame``, or ``None`` when loading fails for any
        reason (loading is deliberately best-effort).
    """
    try:
        return pd.read_csv(config.SAMPLE_DATA_CSV)
    except Exception:
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        return None


def get_sentiment_examples():
    """Get example texts for sentiment analysis.

    Reads ``sentiment.txt`` from ``config.DEMO_SAMPLES_DIR`` and returns one
    example per line.

    Returns:
        A list of example strings. If the file cannot be read, a one-element
        list holding ``config.TASKS["sentiment"]["example"]``.
    """
    try:
        with open(f"{config.DEMO_SAMPLES_DIR}/sentiment.txt") as f:
            return f.read().strip().split("\n")
    except Exception:
        # `Exception` (not a bare `except:`) keeps the best-effort fallback
        # while letting KeyboardInterrupt / SystemExit through.
        # The example is wrapped in a list rather than `.split()`-ed: splitting
        # on whitespace produced individual words, so `[0]` was a single word
        # instead of the full example sentence (the other example getters all
        # return `[example]`).
        return [config.TASKS["sentiment"]["example"]]


def get_ner_examples():
    """Get example texts for NER.

    Reads ``ner.txt`` from ``config.DEMO_SAMPLES_DIR`` and returns one example
    per line.

    Returns:
        A list of example strings. If the file cannot be read, a one-element
        list holding ``config.TASKS["ner"]["example"]``.
    """
    try:
        with open(f"{config.DEMO_SAMPLES_DIR}/ner.txt") as f:
            return f.read().strip().split("\n")
    except Exception:
        # `Exception` (not a bare `except:`) keeps the best-effort fallback
        # while letting KeyboardInterrupt / SystemExit through.
        return [config.TASKS["ner"]["example"]]


def get_qa_examples():
    """Get example contexts for question answering.

    Reads ``qa.txt`` from ``config.DEMO_SAMPLES_DIR``; entries are separated
    by a blank line (the content is split on ``"\\n\\n"``).

    Returns:
        A list of context strings. If the file cannot be read, a one-element
        list holding ``config.TASKS["qa"]["example_context"]``.
    """
    try:
        with open(f"{config.DEMO_SAMPLES_DIR}/qa.txt") as f:
            return f.read().strip().split("\n\n")
    except Exception:
        # `Exception` (not a bare `except:`) keeps the best-effort fallback
        # while letting KeyboardInterrupt / SystemExit through.
        return [config.TASKS["qa"]["example_context"]]


def get_summarization_examples():
    """Get example texts for summarization.

    Reads ``summarization.txt`` from ``config.DEMO_SAMPLES_DIR`` and returns
    one example per line.

    Returns:
        A list of example strings. If the file cannot be read, a one-element
        list holding ``config.TASKS["summarization"]["example"]``.
    """
    try:
        with open(f"{config.DEMO_SAMPLES_DIR}/summarization.txt") as f:
            return f.read().strip().split("\n")
    except Exception:
        # `Exception` (not a bare `except:`) keeps the best-effort fallback
        # while letting KeyboardInterrupt / SystemExit through.
        return [config.TASKS["summarization"]["example"]]


def get_embeddings_examples():
    """Get example texts for semantic similarity.

    Reads ``embeddings.txt`` from ``config.DEMO_SAMPLES_DIR`` and returns one
    example per line.

    Returns:
        A list of example strings. If the file cannot be read, a two-element
        list holding ``example1`` and ``example2`` from
        ``config.TASKS["similarity"]``.
    """
    try:
        with open(f"{config.DEMO_SAMPLES_DIR}/embeddings.txt") as f:
            return f.read().strip().split("\n")
    except Exception:
        # `Exception` (not a bare `except:`) keeps the best-effort fallback
        # while letting KeyboardInterrupt / SystemExit through.
        similarity_task = config.TASKS["similarity"]
        return [similarity_task["example1"], similarity_task["example2"]]


# ===================== SENTIMENT ANALYSIS =====================

@spaces.GPU
def demo_sentiment(text):
    """Demo sentiment analysis.

    Args:
        text: The text to classify.

    Returns:
        A ``(markdown, raw)`` pair: a Markdown summary of the label and
        confidence, and the raw result from ``utils.run_sentiment_analysis``.
        On empty input or a reported error, the first item is the message and
        the second is ``{"error": message}``.
    """
    if not text or not text.strip():
        message = "Please enter some text"
        # The second value feeds a gr.JSON output, which parses plain strings
        # as JSON; return a dict (as demo_summarization does) instead of a
        # bare string that is not valid JSON.
        return message, {"error": message}
    result = utils.run_sentiment_analysis(text)
    # NOTE(review): assumes run_sentiment_analysis reports failures as
    # {"error": ...} like the QA/tokenization helpers - confirm in utils.
    if isinstance(result, dict) and "error" in result:
        return f"Error: {result['error']}", {"error": result["error"]}
    output = f"**Label:** {result['label']}\n\n**Confidence:** {result['score']:.4f}"
    return output, result


# ===================== NER =====================

@spaces.GPU
def demo_ner(text):
    """Demo named entity recognition.

    Args:
        text: The text to scan for entities.

    Returns:
        A ``(markdown, raw)`` pair: the entities formatted by
        ``utils.format_ner_output`` and the raw list from ``utils.run_ner``.
        On empty input or a reported error, the first item is the message and
        the second is ``{"error": message}``.
    """
    if not text or not text.strip():
        message = "Please enter some text"
        # The second value feeds a gr.JSON output, which parses plain strings
        # as JSON; return a dict rather than a bare, non-JSON string.
        return message, {"error": message}
    results = utils.run_ner(text)
    # An error is signalled as a list whose first element is a dict carrying
    # an "error" key.
    if results and isinstance(results, list) and isinstance(results[0], dict) and "error" in results[0]:
        return f"Error: {results[0]['error']}", {"error": results[0]["error"]}
    formatted = utils.format_ner_output(results)
    return formatted, results


# ===================== QUESTION ANSWERING =====================

@spaces.GPU
def demo_qa(context, question):
    """Answer ``question`` from ``context`` and format the result for display.

    Returns a ``(markdown, raw)`` pair. When either input is blank, or when
    the result from ``utils.run_qa`` contains an ``"error"`` key, the markdown
    is the message and the raw value is an empty dict.
    """
    both_provided = context.strip() and question.strip()
    if not both_provided:
        return "Please enter both context and question", {}

    qa_result = utils.run_qa(context, question)
    if "error" in qa_result:
        return f"Error: {qa_result['error']}", {}

    answer_md = f"**Answer:** {qa_result['answer']}\n\n**Confidence:** {qa_result['score']:.4f}"
    return answer_md, qa_result


# ===================== SUMMARIZATION =====================

@spaces.GPU
def demo_summarization(text):
    """Summarize ``text`` and return ``(display_text, details_dict)``.

    Blank input, input shorter than 20 whitespace-separated words, and a
    summary string starting with ``"Error:"`` all yield the message together
    with ``{"error": message}``; otherwise the details are
    ``{"summary": summary}``.
    """

    def as_error(message):
        # Pair a message with its JSON-friendly error payload.
        return message, {"error": message}

    if not text.strip():
        return as_error("Please enter some text")

    word_count = len(text.split())
    if word_count < 20:
        return as_error("Text too short for summarization. Please provide at least 20 words.")

    summary_text = utils.run_summarization(text)
    if summary_text.startswith("Error:"):
        return as_error(summary_text)
    return summary_text, {"summary": summary_text}


# ===================== SEMANTIC SIMILARITY =====================

@spaces.GPU
def demo_similarity(text1, text2):
    """Compare two texts and return ``(markdown, score)``.

    If either text is blank the score is ``0`` with a prompt message. If
    ``utils.compute_similarity`` returns a string, that string is shown as-is
    and the score is ``0``.
    """
    both_provided = text1.strip() and text2.strip()
    if not both_provided:
        return "Please enter both texts", 0

    score = utils.compute_similarity(text1, text2)
    if isinstance(score, str):
        # A string result is passed through as the displayed message.
        return score, 0

    report = f"**Similarity Score:** {score:.4f}\n\n(Score ranges from -1 to 1, where 1 means identical semantically)"
    return report, score


# ===================== TOKENIZATION =====================

@spaces.GPU
def demo_tokenization(text):
    """Demo tokenization.

    Args:
        text: The text to tokenize.

    Returns:
        A ``(markdown, raw)`` pair: the output of
        ``utils.format_tokenizer_output`` and the raw result from
        ``utils.tokenize_text``. On empty input or a reported error, the first
        item is the message and the second is ``{"error": message}``.
    """
    if not text or not text.strip():
        message = "Please enter some text"
        # The second value feeds a gr.JSON output, which parses plain strings
        # as JSON; an empty string is not valid JSON, so return a dict.
        return message, {"error": message}
    result = utils.tokenize_text(text)
    if "error" in result:
        return f"Error: {result['error']}", {"error": result["error"]}
    formatted = utils.format_tokenizer_output(result)
    return formatted, result


# ===================== GRADIO INTERFACE =====================

def create_interface():
    """Create the Gradio interface with 3 tabs.

    Layout:
        * Session 1: overview text plus sentiment and NER demos.
        * Session 2: tokenization explorer and a nested tab set with
          sentiment, NER, QA, summarization and similarity demos.
        * Resources: static reference material.

    Each demo button is wired to the matching ``demo_*`` handler, which
    returns a Markdown value and a raw value for a hidden ``gr.JSON``
    component.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """

    with gr.Blocks(
        title="HuggingFace Enabling Sessions",
    ) as app:
        gr.Markdown(
            """
            # 🤗 HuggingFace Enabling Sessions
            **Interactive Demo for Transformers, Hub APIs, and Pipeline Abstractions**
            
            **Duration:** Session 1: 45 min | Session 2: 90 min
            """
        )

        with gr.Tabs():
            # ===================== TAB 1: SESSION 1 - INTRODUCTION =====================
            with gr.Tab("Session 1: Introduction (45 min)", id="session1"):
                gr.Markdown(
                    """
                    ## 🎯 Introduction to Hugging Face Ecosystem
                    
                    ### What We'll Cover:
                    1. **HuggingFace Platform Overview**
                       - The Hub: Central repository for models, datasets, and spaces
                       - Transformers Library: Core Python library for NLP
                       - Model Cards: Documentation and metadata for transparency
                    
                    2. **Core Abstractions**
                       - **Pipelines:** High-level API for common tasks (sentiment, NER, QA, etc.)
                       - **Models & Tokenizers:** Lower-level building blocks
                       - **Datasets:** Standardized data loading and processing
                    
                    3. **Architecture Patterns**
                       - **Encoders:** BERT, RoBERTa, DistilBERT → Classification, feature extraction
                       - **Decoders:** GPT-2, GPT-3 → Text generation
                       - **Encoder-Decoders:** T5, BART → Seq2seq (translation, summarization, QA)
                    
                    4. **Enterprise NLP Landscape**
                       - Open-source vs. Commercial models
                       - Licensing considerations (MIT, Apache, OpenRAIL, etc.)
                       - Fine-tuning for domain-specific tasks
                    
                    ---
                    
                    ### Live Demo: Explore the Power of Pipelines
                    
                    Try the demos below to see how easy it is to use pre-trained models! 👇
                    """
                )

                with gr.Group():
                    gr.Markdown("### 📊 Demo 1: Sentiment Analysis")
                    demo1_input = gr.Textbox(
                        label="Enter text to analyze sentiment",
                        value="I absolutely love this product!",
                        lines=2,
                    )
                    demo1_btn = gr.Button("Analyze Sentiment", variant="primary")
                    demo1_output = gr.Markdown(label="Result")
                    demo1_json = gr.JSON(label="Raw Output", visible=False)

                    demo1_btn.click(
                        demo_sentiment,
                        inputs=[demo1_input],
                        outputs=[demo1_output, demo1_json],
                    )

                with gr.Group():
                    gr.Markdown("### 🏷️ Demo 2: Named Entity Recognition (NER)")
                    demo2_input = gr.Textbox(
                        label="Enter text for entity recognition",
                        value="Apple Inc. was founded by Steve Jobs in Cupertino, California.",
                        lines=2,
                    )
                    demo2_btn = gr.Button("Extract Entities", variant="primary")
                    demo2_output = gr.Markdown(label="Entities Found")
                    demo2_json = gr.JSON(label="Raw Output", visible=False)

                    demo2_btn.click(
                        demo_ner,
                        inputs=[demo2_input],
                        outputs=[demo2_output, demo2_json],
                    )

                gr.Markdown(
                    """
                    ---
                    ### 💡 Key Takeaways
                    - Pre-trained models save time and resources
                    - HuggingFace Pipelines abstract away complexity
                    - Models are available for dozens of NLP tasks
                    - Easy to fine-tune for specialized use cases
                    
                    **Next:** Head to Session 2 for hands-on development with Tokenizers and Advanced Inference! 🚀
                    """
                )

            # ===================== TAB 2: SESSION 2 - HANDS-ON DEVELOPER =====================
            with gr.Tab("Session 2: Hands-On Developer (90 min)", id="session2"):
                gr.Markdown(
                    """
                    ## 👨‍💻 Building End-to-End NLP Workflows with Hugging Face
                    
                    ### Agenda:
                    1. **Tokenization Deep Dive** (15 min)
                       - Understanding tokenization, token IDs, and attention masks
                       - How models process text internally
                    
                    2. **Inference Playground** (45 min)
                       - Interactive demos across multiple NLP tasks
                       - Learn how to use different model architectures
                       - See real outputs and understand model confidence
                    
                    3. **Exercise Checkpoints** (20 min)
                       - Try your own text inputs
                       - Experiment with different examples
                       - Q&A and troubleshooting
                    
                    4. **Next Steps & Resources** (10 min)
                       - Publishing models to the Hub
                       - Fine-tuning workflow overview
                       - Post-session project ideas
                    
                    ---
                    
                    ### 🔤 Part 1: Tokenization Explorer
                    """
                )

                with gr.Group():
                    gr.Markdown(
                        """
                        #### How Tokenization Works
                        - Text is split into tokens (words/subwords)
                        - Each token gets a unique ID
                        - Attention masks indicate which tokens are real vs. padding
                        - This is how transformers \"understand\" text!
                        """
                    )
                    tok_input = gr.Textbox(
                        label="Enter text to tokenize",
                        value="Hello, how are you?",
                        lines=2,
                    )
                    tok_btn = gr.Button("Tokenize", variant="primary")
                    tok_output = gr.Markdown(label="Tokens")
                    tok_json = gr.JSON(label="Tokenization Details", visible=False)

                    tok_btn.click(
                        demo_tokenization,
                        inputs=[tok_input],
                        outputs=[tok_output, tok_json],
                    )

                gr.Markdown(
                    """
                    ---
                    ### 🎯 Part 2: Inference Playground (Choose a Task)
                    """
                )

                with gr.Tabs():
                    # Task 1: Sentiment
                    with gr.Tab("Sentiment Analysis"):
                        gr.Markdown(
                            """
                            **Classify text as positive, negative, or neutral**
                            
                            Model: DistilBERT fine-tuned on SST-2 dataset
                            """
                        )
                        # Load the examples once (the getter reads a file), as
                        # the QA / summarization / similarity tabs do.
                        sent_examples = get_sentiment_examples()
                        sent_input = gr.Textbox(
                            label="Enter text",
                            value=sent_examples[0] if sent_examples else "I love this!",
                            lines=3,
                        )
                        sent_btn = gr.Button("Analyze", variant="primary")
                        sent_output = gr.Markdown(label="Result")
                        sent_json = gr.JSON(label="Details", visible=False)

                        sent_btn.click(
                            demo_sentiment,
                            inputs=[sent_input],
                            outputs=[sent_output, sent_json],
                        )

                    # Task 2: NER
                    with gr.Tab("Named Entity Recognition"):
                        gr.Markdown(
                            """
                            **Identify people, organizations, locations, and more**
                            
                            Model: BERT fine-tuned on CoNLL-2003 NER dataset
                            """
                        )
                        # Load the examples once instead of calling the getter twice.
                        ner_examples = get_ner_examples()
                        ner_input = gr.Textbox(
                            label="Enter text",
                            value=ner_examples[0] if ner_examples else "Apple Inc. was founded by Steve Jobs",
                            lines=3,
                        )
                        ner_btn = gr.Button("Extract Entities", variant="primary")
                        ner_output = gr.Markdown(label="Entities")
                        ner_json = gr.JSON(label="Details", visible=False)

                        ner_btn.click(
                            demo_ner,
                            inputs=[ner_input],
                            outputs=[ner_output, ner_json],
                        )

                    # Task 3: QA
                    with gr.Tab("Question Answering"):
                        gr.Markdown(
                            """
                            **Answer questions based on provided context**
                            
                            Model: RoBERTa fine-tuned on SQuAD 2.0
                            """
                        )
                        qa_examples = get_qa_examples()
                        qa_context = gr.Textbox(
                            label="Context/Passage",
                            value=qa_examples[0] if qa_examples else config.TASKS["qa"]["example_context"],
                            lines=4,
                        )
                        qa_question = gr.Textbox(
                            label="Question",
                            value="What is the Hugging Face Hub?",
                            lines=2,
                        )
                        qa_btn = gr.Button("Get Answer", variant="primary")
                        qa_output = gr.Markdown(label="Answer")
                        qa_json = gr.JSON(label="Details", visible=False)

                        qa_btn.click(
                            demo_qa,
                            inputs=[qa_context, qa_question],
                            outputs=[qa_output, qa_json],
                        )

                    # Task 4: Summarization
                    with gr.Tab("Text Summarization"):
                        gr.Markdown(
                            """
                            **Generate concise summaries of longer texts**
                            
                            Model: BART large fine-tuned on CNN/DailyMail
                            """
                        )
                        sum_examples = get_summarization_examples()
                        sum_input = gr.Textbox(
                            label="Text to summarize (min 20 words)",
                            value=sum_examples[0] if sum_examples else config.TASKS["summarization"]["example"],
                            lines=5,
                        )
                        sum_btn = gr.Button("Summarize", variant="primary")
                        sum_output = gr.Markdown(label="Summary")
                        sum_json = gr.JSON(label="Details", visible=False)

                        sum_btn.click(
                            demo_summarization,
                            inputs=[sum_input],
                            outputs=[sum_output, sum_json],
                        )

                    # Task 5: Semantic Similarity
                    with gr.Tab("Semantic Similarity"):
                        gr.Markdown(
                            """
                            **Compare semantic similarity between texts**
                            
                            Model: Sentence-BERT (all-MiniLM-L6-v2)
                            """
                        )
                        emb_examples = get_embeddings_examples()
                        emb_text1 = gr.Textbox(
                            label="First text",
                            value=emb_examples[0] if len(emb_examples) > 0 else "The cat is sleeping",
                            lines=2,
                        )
                        emb_text2 = gr.Textbox(
                            label="Second text",
                            value=emb_examples[1] if len(emb_examples) > 1 else "A feline is resting",
                            lines=2,
                        )
                        emb_btn = gr.Button("Compare", variant="primary")
                        emb_output = gr.Markdown(label="Similarity")
                        emb_json = gr.JSON(label="Details", visible=False)

                        emb_btn.click(
                            demo_similarity,
                            inputs=[emb_text1, emb_text2],
                            outputs=[emb_output, emb_json],
                        )

                gr.Markdown(
                    """
                    ---
                    ### 🚀 Part 3: Key Concepts Recap
                    
                    ✅ **Transformers Architecture:**
                    - Self-attention mechanisms allow models to focus on relevant parts of text
                    - Pre-training on large corpora + fine-tuning = transfer learning
                    
                    ✅ **Using HuggingFace:**
                    - Pipelines for quick demos
                    - Fine-tuning for custom tasks
                    - Model Hub for sharing and collaboration
                    
                    ✅ **Production Considerations:**
                    - Model size vs. accuracy tradeoff
                    - Quantization and distillation for faster inference
                    - Licensing and compliance for models
                    """
                )

            # ===================== TAB 3: RESOURCES =====================
            with gr.Tab("Resources & Next Steps", id="resources"):
                gr.Markdown(
                    """
                    ## 📚 Learning Resources
                    
                    ### Official Documentation
                    - [Hugging Face Transformers Documentation](https://huggingface.co/docs/transformers/)
                    - [Hugging Face Datasets Documentation](https://huggingface.co/docs/datasets/)
                    - [Hugging Face Hub Documentation](https://huggingface.co/docs/hub/)
                    
                    ### Tutorials & Courses
                    - [Hugging Face Course (Free)](https://huggingface.co/course/)
                    - [Transformers from Scratch](https://huggingface.co/docs/transformers/training)
                    - [Fine-tuning Guide](https://huggingface.co/docs/transformers/training)
                    
                    ---
                    
                    ## 🛠️ Popular Models to Explore
                    
                    ### Text Classification
                    - `distilbert-base-uncased-finetuned-sst-2-english` - Sentiment Analysis
                    - `roberta-base` - General purpose classifier
                    - `bert-base-multilingual-cased` - Multilingual support
                    
                    ### Named Entity Recognition
                    - `dslim/bert-base-NER` - English NER
                    - `xlm-roberta-base` - Multilingual NER
                    
                    ### Question Answering
                    - `deepset/roberta-base-squad2` - SQuAD 2.0 fine-tuned
                    - `bert-large-uncased-whole-word-masking-finetuned-squad` - BERT Large
                    
                    ### Text Generation
                    - `gpt2` - Lightweight generation
                    - `facebook/bart-large` - Sequence-to-sequence
                    - `google/t5-base` - T5 for various tasks
                    
                    ### Embeddings & Similarity
                    - `sentence-transformers/all-MiniLM-L6-v2` - Fast & efficient
                    - `sentence-transformers/all-mpnet-base-v2` - High quality
                    
                    ---
                    
                    ## 💾 Popular Datasets
                    
                    - `glue` - General Language Understanding Evaluation
                    - `wikitext` - Large language model benchmark
                    - `squad` - Question answering dataset
                    - `conll2003` - Named entity recognition
                    - `imdb` - Sentiment analysis
                    
                    ---
                    
                    ## 🎯 Next Steps After the Sessions
                    
                    ### Beginner Path
                    1. Explore models on the Hub
                    2. Try different models on your own data
                    3. Learn about fine-tuning concepts
                    
                    ### Intermediate Path
                    1. Fine-tune a pre-trained model on your dataset
                    2. Deploy a model to Spaces (like this demo!)
                    3. Publish your model to the Hub
                    
                    ### Advanced Path
                    1. Build multi-stage pipelines
                    2. Implement custom training loops
                    3. Contribute to open-source projects
                    
                    ---
                    
                    ## 🔗 Community & Support
                    
                    - [Hugging Face Forums](https://discuss.huggingface.co/)
                    - [GitHub Issues](https://github.com/huggingface/transformers/issues)
                    - [Twitter/X @huggingface](https://twitter.com/huggingface)
                    - Company Slack/Teams Channels
                    
                    ---
                    
                    ## 📝 Session Information
                    
                    **Session 1: Introduction to Hugging Face** (45 minutes)
                    - Overview of the ecosystem
                    - Core abstractions (Pipelines, Models, Tokenizers)
                    - Architecture patterns
                    - Enterprise considerations
                    
                    **Session 2: Hands-On Developer Workshop** (90 minutes)
                    - Tokenization deep dive
                    - Interactive inference playground (5+ NLP tasks)
                    - Live coding and experimentation
                    - Best practices and next steps
                    
                    ---
                    
                    ### Questions? 
                    Feel free to reach out via Slack or email during the sessions! 💬
                    """
                )

    return app


# Build the Blocks app at import time and expose it as the module-level `app`.
app = create_interface()


# ===================== MAIN =====================

if __name__ == "__main__":
    # Launch settings used when the file is executed directly as a script.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
    }
    app.launch(**launch_options)