Spaces:
Sleeping
Sleeping
| import re | |
| import nltk | |
| import pickle | |
| import numpy as np | |
| import pandas as pd | |
| import streamlit as st | |
| from datasets import load_dataset | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Fetch the tokenizer data that nltk.word_tokenize needs. NLTK 3.9+ looks up
# the "punkt_tab" resource instead of the legacy pickled "punkt" models, so
# both are requested; on releases that do not know "punkt_tab" the extra
# request is expected to fail quietly (download returns False) — TODO confirm
# against the NLTK version pinned for this app.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Preprocessed Hadith dataset; its 'Arabic_Grade' column supplies the labels
# that predict_label returns for the best-matching row.
dataset = pd.read_csv("Preprocess_LK_Hadith_dataset.csv")
labels = dataset['Arabic_Grade']
| # Helper functions | |
# Code points U+0617-U+061A and U+064B-U+0652 (the Arabic diacritic marks this
# app strips). Compiled once at import time instead of on every call.
_TASHKEEL_PATTERN = re.compile(r'[\u0617-\u061A\u064B-\u0652]')


def remove_tashkeel(text):
    """Return *text* with Arabic diacritic marks (tashkeel) removed.

    Only characters in U+0617-U+061A and U+064B-U+0652 are deleted; every
    other character, including non-Arabic text, is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without the matched diacritic characters.
    """
    return _TASHKEEL_PATTERN.sub('', text)
def preprocess_arabic_text(text):
    """Normalize text into a space-separated string of clean tokens.

    Steps, in order:
      1. Strip Arabic diacritics with ``remove_tashkeel``.
      2. Tokenize with ``nltk.word_tokenize``.
      3. Drop every token that is not purely alphanumeric (``str.isalnum``),
         which removes punctuation tokens.
      4. Lowercase the surviving tokens.

    Args:
        text: Raw input string.

    Returns:
        The kept tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    tokens = nltk.word_tokenize(remove_tashkeel(text))
    # Filter and lowercase in one pass rather than building two lists.
    return " ".join(token.lower() for token in tokens if token.isalnum())
| # Function to predict label | |
def predict_label(input_text, threshold=0.5):
    """Return the label of the stored text most similar to *input_text*.

    The input is preprocessed with ``preprocess_arabic_text``, vectorized
    with the pickled vectorizer, and compared by cosine similarity against
    the pickled matrix. The row with the highest similarity wins if its
    score reaches *threshold*.

    Args:
        input_text: Raw text to classify.
        threshold: Minimum cosine similarity required to accept the best
            match. Defaults to 0.5.

    Returns:
        ``labels.iloc[i]`` for the best-matching row ``i``, or the string
        "No similar text found in dataset" when the best score is below
        *threshold* or the input is empty after preprocessing.
    """
    no_match = "No similar text found in dataset"

    query = preprocess_arabic_text(input_text)
    if not query:
        # Nothing survived preprocessing, so the query vector would be all
        # zeros; bail out instead of "matching" row 0 with similarity 0 when
        # a caller passes threshold <= 0.
        return no_match

    # NOTE(security): pickle.load can execute arbitrary code. These files
    # must be trusted artifacts shipped with the app, never user uploads.
    with open("tfidf_vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)
    with open("cosine_similarity_model.pkl", "rb") as f:
        # Presumably the vectorized dataset texts, with rows in the same
        # order as `labels` — TODO confirm against the training script.
        X = pickle.load(f)

    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, X).flatten()

    best_index = int(np.argmax(similarities))
    if similarities[best_index] >= threshold:
        return labels.iloc[best_index]
    return no_match
# The classifier works on text, so collect it with a text box; a slider
# yields a number, which preprocess_arabic_text cannot process.
x = st.text_input('Enter Hadith')

# Classify only once something has been typed, and display the predicted
# label by calling predict_label rather than passing the function object.
if x.strip():
    st.write(x, 'Hadith Classification', predict_label(x))