Keyword Matcher — a Streamlit app that counts exact or fuzzy keyword matches in an article.
# Standard library
import io
import string

# Third-party
import nltk
import pandas as pd
import streamlit as st
from fuzzywuzzy import fuzz, process
from nltk.corpus import stopwords

# Fetch the stopword corpus up front so clean_text_fuzzy can use it.
nltk.download('stopwords')

# Helper functions
def clean_text(text):
    """Normalize a value for exact matching: lowercase, no punctuation.

    The input is coerced to ``str`` first — keyword lists loaded from a
    spreadsheet's first column may contain numbers, which the original
    code crashed on (``int`` has no ``.lower()``).
    """
    text = str(text).lower()
    # One C-level pass removes every ASCII punctuation character.
    return text.translate(str.maketrans('', '', string.punctuation))
def clean_text_fuzzy(text):
    """Normalize text for fuzzy matching.

    Lowercases, strips punctuation, and drops English stopwords so that
    fuzzy scores are driven by the content words only.
    """
    stop_words = set(stopwords.words('english'))
    lowered = text.lower().translate(str.maketrans('', '', string.punctuation))
    kept = (word for word in lowered.split() if word not in stop_words)
    return " ".join(kept)
def process_matching(keywords, article, fuzzy, mode):
    """Count how often each keyword appears in *article*.

    Parameters
    ----------
    keywords : list
        Raw keyword phrases; each is cleaned (lowercased, de-punctuated)
        before matching.
    article : str
        Text to search.
    fuzzy : bool
        When True, match keywords against article n-grams with
        fuzzywuzzy's partial_ratio, counting scores above 90; otherwise
        count exact n-gram equality.
    mode : str
        ``"filter"`` drops zero-count keywords from the returned dict;
        any other value keeps them.

    Returns
    -------
    (dict, int)
        Per-keyword counts and the total count across all keywords.
    """
    # Clean and de-duplicate while preserving order.  The original code
    # iterated the raw list, so a keyword appearing twice had its count
    # accumulated twice (the `not in results` guard only protected the
    # initialization, not the unconditional `+=`).
    keywords = list(dict.fromkeys(clean_text(k) for k in keywords))
    if not keywords:
        # Guard: max() below raises ValueError on an empty sequence.
        return {}, 0

    max_keyword_length = max(len(k.split()) for k in keywords)
    results = {k: 0 for k in keywords}

    if fuzzy:
        # The stopword-stripped article is only needed on the fuzzy path,
        # so compute it lazily (avoids an NLTK lookup in exact mode).
        fuzzy_words = clean_text_fuzzy(article).split()
        for n in range(1, max_keyword_length + 1):
            # Build each n-gram list once per size, not once per keyword.
            grams = [" ".join(fuzzy_words[i:i + n])
                     for i in range(len(fuzzy_words) - n + 1)]
            for keyword in keywords:
                matches = process.extract(keyword, grams,
                                          scorer=fuzz.partial_ratio,
                                          limit=None)
                results[keyword] += sum(1 for _, score in matches
                                        if score > 90)
    else:
        exact_words = clean_text(article).split()
        for keyword in keywords:
            # An m-word keyword can only ever equal an m-gram, so the
            # original's scan over every n-gram size was wasted work
            # (all other sizes always contributed zero).
            n = len(keyword.split())
            grams = [" ".join(exact_words[i:i + n])
                     for i in range(len(exact_words) - n + 1)]
            results[keyword] += grams.count(keyword)

    if mode == "filter":
        results = {k: v for k, v in results.items() if v > 0}
    return results, sum(results.values())
# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("Keyword Matcher")

# Mode selection: map the display label onto the internal mode token
# that process_matching understands ("frequency" / "filter").
mode_label = st.radio("Select Mode:", ["Keyword Frequency", "Keyword Filter"],
                      horizontal=True)
mode = "frequency" if mode_label == "Keyword Frequency" else "filter"

# Keyword input: comma-separated text, or the first column of a file.
st.subheader("Keywords")
keywords_input = st.text_area("Enter keywords (comma separated):")
uploaded_file = st.file_uploader(
    "Or upload a CSV/Excel file with keywords (first column):",
    type=["csv", "xlsx"],
)

keywords = []
if uploaded_file:
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)
    if not df.empty:
        # Coerce to str and strip: spreadsheet cells may hold numbers or
        # padded text, which the original passed through unchanged.
        keywords = [str(k).strip() for k in df.iloc[:, 0].dropna().tolist()]
if not keywords:
    # Fall back to the text box when no file was given — or when the
    # uploaded file produced no usable keywords (the original silently
    # ignored the text box whenever any file was uploaded).
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]

# Article input
st.subheader("Article")
article = st.text_area("Paste the article text here:")

# Fuzzy matching toggle
fuzzy = st.checkbox("Enable Fuzzy Matching")

# Process button: validate inputs, run the matcher, show and export results.
if st.button("Process"):
    if not keywords:
        st.error("Please provide keywords.")
    elif not article:
        st.error("Please provide an article.")
    else:
        results, total_count = process_matching(keywords, article, fuzzy, mode)
        st.subheader("Results")
        for keyword, count in results.items():
            st.write(f"{keyword}: {count}")
        st.write(f"**Total Count:** {total_count}")

        # Build an in-memory Excel workbook and offer it for download.
        st.subheader("Download Results")
        df = pd.DataFrame(list(results.items()), columns=["Keyword", "Count"])
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Results")
        output.seek(0)
        st.download_button(
            label="Download Results as Excel",
            data=output,
            file_name="results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )