Mohamed-Maher's picture
Upload app.py
18feff1 verified
raw
history blame
1.46 kB
import re
import nltk
import pickle
import numpy as np
import pandas as pd
import streamlit as st
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK 'punkt' resource at import time (presumably needed by the
# nltk.word_tokenize call in preprocess_arabic_text -- confirm for the
# installed NLTK version).
nltk.download('punkt')
# Load the Hadith dataset from a CSV expected in the working directory.
dataset = pd.read_csv("Preprocess_LK_Hadith_dataset.csv")
# The 'Arabic_Grade' column; predict_label returns entries from it by
# positional index, so its row order must match the similarity matrix rows.
labels = dataset['Arabic_Grade']
# Helper functions
# Compiled once at import time instead of on every call. Matches the code
# points U+0617-U+061A and U+064B-U+0652 (Arabic diacritic marks).
_TASHKEEL_PATTERN = re.compile(r'[\u0617-\u061A\u064B-\u0652]')


def remove_tashkeel(text):
    """Return *text* with Arabic diacritics (tashkeel) removed.

    Only characters in the ranges U+0617-U+061A and U+064B-U+0652 are
    stripped; every other character, including non-Arabic text, is left
    untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with the matching marks deleted.
    """
    return _TASHKEEL_PATTERN.sub('', text)
def preprocess_arabic_text(text):
    """Normalize text before it is passed to the vectorizer.

    Steps, in order:
      1. Strip diacritics via ``remove_tashkeel``.
      2. Tokenize with ``nltk.word_tokenize``.
      3. Keep only tokens for which ``str.isalnum()`` is true (this drops
         punctuation-only tokens and tokens mixing letters with symbols).
      4. Lowercase each remaining token.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives the filter.
    """
    text = remove_tashkeel(text)
    tokens = nltk.word_tokenize(text)
    # Filter and lowercase in a single pass rather than building two lists.
    return " ".join(token.lower() for token in tokens if token.isalnum())
# Function to predict label
def predict_label(input_text, threshold=0.5):
    """Return the label of the stored entry most similar to *input_text*.

    The input is preprocessed with ``preprocess_arabic_text``, transformed
    by the pickled vectorizer, and compared by cosine similarity against the
    pickled matrix. The label at the position of the highest similarity is
    returned when that similarity reaches *threshold*.

    Args:
        input_text: Raw text to classify.
        threshold: Minimum cosine similarity required to accept the best
            match. Defaults to 0.5.

    Returns:
        The entry of the module-level ``labels`` series at the best-match
        position, or the string "No similar text found in dataset" when the
        best similarity is below *threshold* or there is nothing to compare
        against.

    Raises:
        FileNotFoundError: If either pickle file is missing from the
            working directory.
    """
    # NOTE(security): pickle.load can execute arbitrary code. These files
    # must only ever come from a trusted source shipped with the app.
    with open("tfidf_vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)
    with open("cosine_similarity_model.pkl", "rb") as f:
        # Despite the file name, this is used below as the matrix the input
        # vector is compared against (presumably the vectorized dataset --
        # confirm against the training script).
        X = pickle.load(f)

    input_text = preprocess_arabic_text(input_text)
    input_vector = vectorizer.transform([input_text])
    similarities = cosine_similarity(input_vector, X).flatten()

    # np.argmax raises ValueError on an empty array; treat "no rows to
    # compare against" the same as "nothing similar enough".
    if similarities.size == 0:
        return "No similar text found in dataset"

    max_index = np.argmax(similarities)
    max_similarity = similarities[max_index]
    if max_similarity >= threshold:
        # Positional lookup: assumes labels rows are aligned with X rows.
        return labels.iloc[max_index]
    return "No similar text found in dataset"
# Streamlit UI: read the hadith text from the user and show its predicted
# grade. A text widget is required here -- st.slider yields a number, which
# cannot be tokenized or vectorized as text.
x = st.text_input('Enter Hadith')
# Skip classification while the field is empty (e.g. on first page load).
if x:
    # Call predict_label on the input; passing the bare function object
    # would only display the function itself, never a prediction.
    st.write(x, 'Hadith Classification', predict_label(x))