Spaces:

Hemanth-thunder
/

tamil_lemmatizer

Sleeping

App Files Files Community

tamil_lemmatizer / src /streamlit_app.py

Hemanth-thunder

Update src/streamlit_app.py

4c5a059 verified 5 months ago

raw

history blame contribute delete

4.97 kB

	import streamlit as st
	from tamil_lemmatizer import TamilLemmatizer
	import re

	# Page metadata for the Hugging Face Space: browser-tab title and icon.
	# NOTE(review): Streamlit has traditionally required set_page_config to be the
	# first Streamlit command executed in a script — confirm against the pinned
	# Streamlit version before moving this call.
	st.set_page_config(page_title="Tamil Lemmatizer", page_icon="🎯")

	# Matches every character that is neither a Unicode word character nor inside
	# the Tamil block (U+0B80-U+0BFF). The explicit range is what keeps Tamil
	# vowel signs and the pulli, which are combining marks rather than letters.
	_NON_TAMIL_WORD_CHARS = re.compile(r"[^\w\u0B80-\u0BFF]")


	def clean_word(word):
	    """Return ``word`` with punctuation, whitespace and symbols removed.

	    Word characters (letters, digits, underscore) and every code point of
	    the Tamil Unicode block are kept; everything else is dropped.
	    """
	    return _NON_TAMIL_WORD_CHARS.sub("", word)

	def lemma_highlight(paragraph, search_word, lemmatizer):
	    """Render ``paragraph`` as HTML, highlighting words that share a lemma.

	    The paragraph is split on whitespace. Each token is stripped of
	    punctuation with ``clean_word`` and lemmatized; tokens whose lemma equals
	    the lemma of the cleaned ``search_word`` are wrapped in a yellow
	    ``<span>``. Tokens are re-joined with single spaces.

	    Args:
	        paragraph: Text to scan.
	        search_word: Word whose lemma is searched for. Surrounding
	            whitespace and punctuation are ignored.
	        lemmatizer: Object exposing a ``lemmatize(word)`` method.

	    Returns:
	        An HTML string. Token text is HTML-escaped, so markup typed into the
	        paragraph is shown literally instead of being interpreted when the
	        caller renders the result with ``unsafe_allow_html=True``.
	    """
	    # Function-scope import keeps this block usable without touching the
	    # file's import section.
	    from html import escape

	    tokens = paragraph.split()

	    target = clean_word(search_word.strip())
	    if not target:
	        # The search term has no lemmatizable characters, so nothing can match.
	        return " ".join(escape(token, quote=False) for token in tokens)

	    search_lemma = lemmatizer.lemmatize(target)

	    # Paragraphs repeat words, so each distinct cleaned token is lemmatized
	    # once. NOTE(review): assumes lemmatize() is deterministic per input.
	    lemma_by_word = {}
	    pieces = []
	    for token in tokens:
	        safe_token = escape(token, quote=False)
	        cleaned = clean_word(token)
	        if not cleaned:
	            # Pure punctuation has no lemma and must never be highlighted.
	            pieces.append(safe_token)
	            continue

	        if cleaned not in lemma_by_word:
	            lemma_by_word[cleaned] = lemmatizer.lemmatize(cleaned)

	        if lemma_by_word[cleaned] == search_lemma:
	            pieces.append(
	                f"<span style='background-color: yellow; padding: 3px; color: black;'>{safe_token}</span>"
	            )
	        else:
	            pieces.append(safe_token)

	    return " ".join(pieces)

	def main():
	    """Render the Tamil Lemmatizer Streamlit app.

	    Builds two tabs: one that lemmatizes a single word and one that
	    highlights every word in a paragraph sharing the lemma of a search word.
	    "About" and "Citation" sections follow.
	    """
	    st.title("Tamil Lemmatizer 🎯")

	    # Load the model once and reuse it across reruns via Streamlit's
	    # resource cache.
	    @st.cache_resource
	    def load_model():
	        return TamilLemmatizer()

	    lemmatizer = load_model()

	    tab1, tab2 = st.tabs(["Lemmatizer", "Contextual Lemma Finder"])

	    with tab1:
	        st.write("Inflected Tamil word → Base lemma")
	        st.write("Try any inflected Tamil word to get its base form!")

	        # Sample words
	        sample_words = [
	            "பாடிக்கொண்டிருந்தாள்",
	            "நடந்தார்கள்",
	            "எழுதியிருக்கிறேன்",
	            "சாப்பிடவில்லை",
	            "போனார்கள்",
	        ]

	        st.subheader("Try these sample words:")
	        # One column per sample, so the layout follows the list instead of a
	        # hard-coded count.
	        cols = st.columns(len(sample_words))
	        for col, sample in zip(cols, sample_words):
	            if col.button(sample):
	                st.session_state["word"] = sample

	        # Bind the input to st.session_state["word"] through ``key`` instead
	        # of passing the stored word as the default value. A default only
	        # takes effect when it changes, so re-clicking the same sample after
	        # typing something else would otherwise leave the box untouched.
	        # Session state is written above, before the widget is instantiated.
	        word = st.text_input("Enter a Tamil word:", key="word")

	        # Run inference
	        query = word.strip()
	        if query:
	            try:
	                lemma = lemmatizer.lemmatize(query)
	                st.success(f"Lemma : {lemma}")
	            except Exception as e:
	                # UI boundary: show any model failure to the user rather than
	                # crashing the app.
	                st.error(f"Error: {e}")

	    with tab2:
	        st.header("Contextual Lemma Finder")
	        st.write("Find all occurrences of a word's lemma in a paragraph.")

	        default_paragraph = "அவர்கள் நேற்று சந்தித்தபோது பேசிக்கொண்டிருந்த நிகழ்வுகளை இன்று அவர் நண்பர்களிடம் பேசுகிறார். குழந்தைகள் அந்த விஷயத்தைப் பற்றி ஒன்றோ இரண்டோ வரிகளாக பேசுகின்றனர், மேலும் பெரியவர்கள் அதை விவாதிப்பதாக பேசுகிறார்கள். சில முக்கியமான தகவல்களை ஆசிரியர் அனைவருக்கும் பேசுகிறது என்று மாணவர்கள் நினைக்கிறார்கள்."
	        default_search_word = "பேசினார்"

	        paragraph = st.text_area("Enter Tamil Paragraph", value=default_paragraph, height=150)
	        search_word = st.text_input("Word to Search (lemma-based)", value=default_search_word)

	        if st.button("Find Lemma"):
	            # Whitespace-only input counts as missing.
	            if paragraph.strip() and search_word.strip():
	                try:
	                    result_html = lemma_highlight(paragraph, search_word, lemmatizer)
	                    st.markdown(result_html, unsafe_allow_html=True)
	                except Exception as e:
	                    st.error(f"Error: {e}")
	            else:
	                st.warning("Please enter both a paragraph and a search word.")

	    # NOTE(review): the original nesting was lost in extraction; the sections
	    # below are placed at function level (outside the tabs) — confirm.
	    st.write("---")
	    st.subheader("About this Space")
	    st.write(
	"""
	This demo uses a character-level seq2seq Tamil lemmatization model
	to convert inflected Tamil words into their base lemma.
	Repository:
	https://github.com/Hemanthkumar2112/tamil-lemmatizer
	PyPI Package:
	`pip install tamil-lemmatizer`
	https://pypi.org/project/tamil-lemmatizer/
	Contact:
	hemanthmurugan21@gmail.com
	"""
	    )

	    st.write("---")
	    st.subheader("Citation")
	    st.code(
	"""@misc{tamil_lemmatizer,
	author = {Hemanth Thunder},
	title = {Tamil Lemmatizer: A Neural Lemmatization Model for Tamil},
	year = {2025},
	publisher = {GitHub},
	url = {https://github.com/hemanthkumar2112/tamil-lemmatizer}
	}""",
	        language="bibtex",
	    )

	# Script entry point: build the UI only when this file is executed directly.
	if __name__ == "__main__":
	    main()