import re
import string

import gradio as gr
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# --- 1. NLTK Resource Downloads ---
def _ensure_nltk_resource(resource_path, download_name):
    """Download an NLTK data package if it is not already installed.

    Args:
        resource_path: The nltk.data.find() lookup path (e.g. 'corpora/wordnet').
        download_name: The package name passed to nltk.download().
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # BUG FIX: nltk.data.find() raises LookupError when a resource is
        # missing. The original code caught nltk.downloader.DownloadError,
        # which data.find() never raises, so on a fresh environment (e.g.
        # Hugging Face) the downloads silently never ran and the app crashed.
        nltk.download(download_name)


# Ensure necessary NLTK data is downloaded for running on fresh environments.
_ensure_nltk_resource('tokenizers/punkt', 'punkt')
# NOTE(review): NLTK >= 3.8.2 requires the 'punkt_tab' package for
# word_tokenize; fetching both keeps older and newer NLTK versions working.
_ensure_nltk_resource('tokenizers/punkt_tab', 'punkt_tab')
_ensure_nltk_resource('corpora/stopwords', 'stopwords')
_ensure_nltk_resource('corpora/wordnet', 'wordnet')

# Initialize NLP resources once at module level so they are not rebuilt on
# every request.
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()


# --- 2. Helper Preprocessing Functions (Based on User's Outline) ---

def remove_non_ascii(words):
    """Removes non-ASCII characters from a list of tokens."""
    return [re.sub(r'[^\x00-\x7F]+', '', word) for word in words]


def to_lowercase(words):
    """Converts all tokens to lowercase."""
    return [word.lower() for word in words]


def remove_punctuation(words):
    """Removes punctuation tokens.

    Also filters out empty strings, which remove_non_ascii() can produce
    when a token was made up entirely of non-ASCII characters.
    """
    return [word for word in words if word not in string.punctuation and word != '']


def remove_stopwords(words):
    """Removes common English stop words.

    Note: Punctuation must be removed *before* this step, otherwise it might
    be preserved.
    """
    return [word for word in words if word not in STOP_WORDS]


def lemmatize_list(words):
    """Lemmatizes tokens to their base form (lemma).

    Using pos='v' (verb) is a common heuristic when POS tagging is skipped
    for simplicity.
    """
    return [LEMMATIZER.lemmatize(word, pos='v') for word in words]


# --- 3. Main Normalization Function with Step Tracking ---

def normalize_with_steps(text):
    """Full preprocessing pipeline that tracks the output at each step.

    Args:
        text: The raw input string to normalize.

    Returns:
        A Markdown-formatted string showing the token list after every
        pipeline stage (tokenize -> de-ASCII -> lowercase -> de-punctuate ->
        de-stopword -> lemmatize) plus the final re-joined text.
    """
    output_log = f"**Input Text:**\n`{text}`\n\n---"

    # Step 1: Tokenization
    words = word_tokenize(text)
    output_log += f"\n\n**Step 1: Tokenization (nltk.word_tokenize)**\nBreaks text into individual words, including punctuation.\n`{words}`"

    # Step 2: Remove Non-ASCII
    words = remove_non_ascii(words)
    output_log += f"\n\n**Step 2: Remove Non-ASCII**\nRemoves characters outside the standard ASCII set.\n`{words}`"

    # Step 3: To Lowercase
    words = to_lowercase(words)
    output_log += f"\n\n**Step 3: To Lowercase**\nConverts all characters to lowercase for consistency.\n`{words}`"

    # Step 4: Remove Punctuation
    words = remove_punctuation(words)
    output_log += f"\n\n**Step 4: Remove Punctuation**\nFilters out tokens that are only punctuation marks.\n`{words}`"

    # Step 5: Remove Stopwords
    words = remove_stopwords(words)
    output_log += f"\n\n**Step 5: Remove Stopwords**\nRemoves common, less meaningful words (e.g., 'the', 'is', 'a').\n`{words}`"

    # Step 6: Lemmatization
    words = lemmatize_list(words)
    output_log += f"\n\n**Step 6: Lemmatization (WordNetLemmatizer)**\nReduces words to their dictionary root form (e.g., 'running' -> 'run').\n`{words}`"

    # Step 7: Join Final Tokens
    final_text = ' '.join(words)
    output_log += f"\n\n---"
    output_log += f"\n\n**Final Output (Joined Text):**\n`{final_text}`"

    return output_log


# --- 4. Gradio Interface Setup ---

# Example Inputs for the user to try
examples = [
    ["The dogs are running very quickly in the parks and they aren't stopping!"],
    ["Washingtón, D.C. has a lot of interesting historical monuments. \n$1000 is needed."],
    ["I've been playing football for years, but now I'm traveling to London."],
]

# Create the Gradio interface
iface = gr.Interface(
    fn=normalize_with_steps,
    inputs=gr.Textbox(
        lines=3,
        label="Enter Text to Preprocess",
        placeholder="Type a sentence here...",
        value=examples[0][0]  # Set default value
    ),
    outputs=gr.Markdown(
        label="NLP Preprocessing Steps (Step-by-Step Transformation)",
    ),
    title="NLP Text Normalization Pipeline Demo",
    description="This application demonstrates a full text preprocessing pipeline using NLTK, showing the transformation after each step: Tokenization, Lowercasing, Punctuation Removal, Stopword Removal, and Lemmatization.",
    examples=examples,
    allow_flagging='never'  # NOTE(review): deprecated in Gradio 4+ in favor of flagging_mode; kept for compatibility with older Gradio
)

# Launch the app (Setting share=True allows for deployment)
# iface.launch(share=True)  # Use this command if running locally or in a notebook
# For Hugging Face deployment, the app is launched via the `app.py` execution environment.
iface.launch()