Spaces:
Sleeping
Sleeping
import html
import re

import streamlit as st
from tamil_lemmatizer import TamilLemmatizer
# Page-level configuration: sets the page title and page icon for this app.
st.set_page_config(
    page_title="Tamil Lemmatizer",
    page_icon="🎯",
)
# Matches every character that is neither a regex word character nor inside
# the Tamil Unicode block (U+0B80-U+0BFF); compiled once at import time.
_NON_WORD_CHARS = re.compile(r"[^\w\u0B80-\u0BFF]")


def clean_word(word):
    """Return *word* with punctuation and other non-word characters removed.

    Regex word characters (letters, digits, underscore) and all code points
    in the Tamil block U+0B80-U+0BFF are kept; everything else is deleted.
    """
    return _NON_WORD_CHARS.sub("", word)
def lemma_highlight(paragraph, search_word, lemmatizer):
    """Highlight every word of *paragraph* that shares a lemma with *search_word*.

    The paragraph is split on whitespace. Each token is stripped of
    punctuation with ``clean_word`` and lemmatized; tokens whose lemma equals
    the lemma of the (cleaned) search word are wrapped in a yellow
    ``<span>``. Tokens are re-joined with single spaces, so the original
    line breaks and spacing are not preserved.

    Args:
        paragraph: Free text to scan.
        search_word: Word whose lemma is looked for.
        lemmatizer: Object exposing ``lemmatize(str)``; results are compared
            with ``==``.

    Returns:
        An HTML-safe string: all token text is HTML-escaped and matching
        tokens are wrapped in a highlighting ``<span>``. If the search word
        contains nothing but punctuation/whitespace, nothing is highlighted.
    """
    words = paragraph.split()

    def _safe(text):
        # The result is rendered as raw HTML by the caller, so user text must
        # be escaped. quote=False: it only ever lands in element content.
        return html.escape(text, quote=False)

    cleaned_search = clean_word(search_word.strip())
    if not cleaned_search:
        # Nothing meaningful to look for; avoid feeding "" to the model.
        return " ".join(_safe(w) for w in words)
    search_lemma = lemmatizer.lemmatize(cleaned_search)

    # Lemmatize each distinct cleaned token once; repeated words are common
    # in running text. Assumes lemmatize() is deterministic for a given input.
    lemma_cache = {}
    output = []
    for original in words:
        cleaned = clean_word(original)
        is_match = False
        if cleaned:  # punctuation-only tokens can never match
            if cleaned not in lemma_cache:
                lemma_cache[cleaned] = lemmatizer.lemmatize(cleaned)
            is_match = lemma_cache[cleaned] == search_lemma

        escaped = _safe(original)
        if is_match:
            output.append(
                f"<span style='background-color: yellow; padding: 3px; color: black;'>{escaped}</span>"
            )
        else:
            output.append(escaped)

    return " ".join(output)
# Inflected example words offered as one-click inputs on the first tab.
_SAMPLE_WORDS = (
    "பாடிக்கொண்டிருந்தாள்",
    "நடந்தார்கள்",
    "எழுதியிருக்கிறேன்",
    "சாப்பிடவில்லை",
    "போனார்கள்",
)

# Pre-filled inputs for the "Contextual Lemma Finder" tab.
_DEFAULT_PARAGRAPH = "அவர்கள் நேற்று சந்தித்தபோது பேசிக்கொண்டிருந்த நிகழ்வுகளை இன்று அவர் நண்பர்களிடம் பேசுகிறார். குழந்தைகள் அந்த விஷயத்தைப் பற்றி ஒன்றோ இரண்டோ வரிகளாக பேசுகின்றனர், மேலும் பெரியவர்கள் அதை விவாதிப்பதாக பேசுகிறார்கள். சில முக்கியமான தகவல்களை ஆசிரியர் அனைவருக்கும் பேசுகிறது என்று மாணவர்கள் நினைக்கிறார்கள்."
_DEFAULT_SEARCH_WORD = "பேசினார்"

# Markdown shown in the "About this Space" section.
_ABOUT_MARKDOWN = """
This demo uses a character-level seq2seq Tamil lemmatization model
to convert inflected Tamil words into their base lemma.
**Repository:**
https://github.com/Hemanthkumar2112/tamil-lemmatizer
**PyPI Package:**
`pip install tamil-lemmatizer`
https://pypi.org/project/tamil-lemmatizer/
**Contact:**
hemanthmurugan21@gmail.com
"""

# BibTeX entry shown verbatim in the "Citation" section.
_CITATION_BIBTEX = """@misc{tamil_lemmatizer,
author = {Hemanth Thunder},
title = {Tamil Lemmatizer: A Neural Lemmatization Model for Tamil},
year = {2025},
publisher = {GitHub},
url = {https://github.com/hemanthkumar2112/tamil-lemmatizer}
}"""


def _render_lemmatizer_tab(lemmatizer):
    """Draw the single-word tab: sample buttons, an input box and the lemma."""
    st.write("Inflected Tamil word → Base lemma")
    st.write("Try any inflected Tamil word to get its base form!")

    st.subheader("Try these sample words:")
    # One column per sample, so the row stays correct if the list changes.
    columns = st.columns(len(_SAMPLE_WORDS))
    for column, sample in zip(columns, _SAMPLE_WORDS):
        if column.button(sample):
            st.session_state["word"] = sample

    # The most recently clicked sample (if any) pre-fills the input box.
    word = st.text_input("Enter a Tamil word:", st.session_state.get("word", "")).strip()
    if not word:
        return

    try:
        lemma = lemmatizer.lemmatize(word)
    except Exception as e:  # UI boundary: report model errors instead of crashing the page
        st.error(f"Error: {e}")
    else:
        st.success(f"Lemma : **{lemma}**")


def _render_finder_tab(lemmatizer):
    """Draw the paragraph tab: highlight words sharing the search word's lemma."""
    st.header("Contextual Lemma Finder")
    st.write("Find all occurrences of a word's lemma in a paragraph.")

    paragraph = st.text_area("Enter Tamil Paragraph", value=_DEFAULT_PARAGRAPH, height=150)
    search_word = st.text_input("Word to Search (lemma-based)", value=_DEFAULT_SEARCH_WORD)

    if not st.button("Find Lemma"):
        return

    # Whitespace-only input counts as missing.
    if not (paragraph.strip() and search_word.strip()):
        st.warning("Please enter both a paragraph and a search word.")
        return

    try:
        result_html = lemma_highlight(paragraph, search_word, lemmatizer)
    except Exception as e:  # UI boundary: report model errors instead of crashing the page
        st.error(f"Error: {e}")
    else:
        # The result contains <span> markup, so raw HTML must be enabled.
        st.markdown(result_html, unsafe_allow_html=True)


def _render_footer():
    """Draw the "About this Space" and "Citation" sections."""
    st.write("---")
    st.subheader("About this Space")
    st.write(_ABOUT_MARKDOWN)

    st.write("---")
    st.subheader("Citation")
    st.code(_CITATION_BIBTEX, language="bibtex")


def main():
    """Build the Streamlit page: title, the two tabs and the footer sections.

    Streamlit re-executes the script on every widget interaction, so the
    lemmatizer is created through ``st.cache_resource`` and built only once
    instead of on each rerun.
    """
    st.title("Tamil Lemmatizer 🎯")

    @st.cache_resource
    def load_model():
        # Cached: constructed on the first run, then reused across reruns.
        return TamilLemmatizer()

    lemmatizer = load_model()

    tab1, tab2 = st.tabs(["Lemmatizer", "Contextual Lemma Finder"])
    with tab1:
        _render_lemmatizer_tab(lemmatizer)
    with tab2:
        _render_finder_tab(lemmatizer)

    # NOTE(review): rendered below both tabs; the source's indentation was
    # lost, so confirm these sections were not meant to live inside tab 2.
    _render_footer()
# Build the UI only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()