Spaces:

bushra1dajam
/

Wajeez

Build error

App Files Files Community

bushra1dajam committed on Nov 15, 2024

Commit

ea88a50

verified ·

1 Parent(s): a933190

Upload 4 files

Browse files

Files changed (4) hide show

app.py +218 -0
logo.png +0 -0
svm_model.pkl +3 -0
tfidf_vectorizer.pkl +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,218 @@

+import torch
+import transformers
+from transformers import AutoTokenizer, AutoModel , AutoModelForCausalLM
+from transformers import AutoModelForSeq2SeqLM
+import pickle
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import nltk
+from nltk.tokenize import word_tokenize
+import re
+import string
+from nltk.corpus import stopwords
+from tashaphyne.stemming import ArabicLightStemmer
+import pyarabic.araby as araby
+from sklearn.feature_extraction.text import TfidfVectorizer
+import streamlit as st
def _ensure_nltk_data(resource_path, package):
    """Download an NLTK package only when it is not already installed.

    Streamlit re-executes this script on every widget interaction, so an
    unconditional ``nltk.download`` call would be repeated on each rerun.

    Args:
        resource_path: Resource path understood by ``nltk.data.find``.
        package: Package identifier passed to ``nltk.download``.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package, quiet=True)


_ensure_nltk_data('tokenizers/punkt', 'punkt')
# Newer NLTK releases make word_tokenize read the "punkt_tab" resource
# instead of the pickled "punkt" models; fetch both so either version works.
_ensure_nltk_data('tokenizers/punkt_tab', 'punkt_tab')


@st.cache_resource
def _load_classifier():
    """Load the TF-IDF vectorizer and the classifier from their pickle files.

    Cached so the files are read once per server process rather than on
    every Streamlit rerun.

    Returns:
        A ``(vectorizer, classifier)`` tuple.
    """
    # NOTE(security): pickle.load can execute arbitrary code. These files must
    # only ever be the artifacts shipped with the app, never user-supplied.
    with open('tfidf_vectorizer.pkl', 'rb') as f:
        loaded_vectorizer = pickle.load(f)
    with open('svm_model.pkl', 'rb') as f:
        loaded_classifier = pickle.load(f)
    return loaded_vectorizer, loaded_classifier


@st.cache_resource
def _load_summarizer():
    """Load the AraBART seq2seq model and tokenizer and move the model to a device.

    Cached so the checkpoint is loaded once per server process rather than
    on every Streamlit rerun.

    Returns:
        A ``(model, tokenizer, device)`` tuple; ``device`` is CUDA when
        available, otherwise CPU.
    """
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("bushra1dajam/AraBART")
    seq2seq_tokenizer = AutoTokenizer.from_pretrained('bushra1dajam/AraBART')
    seq2seq_model.to(target_device)
    return seq2seq_model, seq2seq_tokenizer, target_device


# Module-level names kept as-is because the functions below read them.
vectorizer, model_classify = _load_classifier()
model, tokenizer, device = _load_summarizer()
def summarize_text(text):
    """Summarize ``text`` with the module-level seq2seq model.

    The input is prefixed with "summarize: ", tokenized with truncation at
    512 tokens, and decoded from an 8-beam search.

    Args:
        text: Raw text to summarize.

    Returns:
        The decoded summary with special tokens removed, or "" when ``text``
        is empty or contains only whitespace.
    """
    # Nothing to summarize: skip generation instead of decoding a summary of
    # the bare "summarize: " prompt.
    if not text or not text.strip():
        return ""
    encoded = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    summary_ids = model.generate(
        input_ids=encoded["input_ids"],
        # Pass the mask explicitly so generate() does not have to infer it.
        attention_mask=encoded.get("attention_mask"),
        max_length=512,
        num_beams=8,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
def remove_numbers(text):
    """Return ``text`` with every run of digit characters deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)
# Matches one or more characters that are NOT kept. Kept characters are:
# ASCII digits, ".", the Arabic block (which already contains the
# Arabic-Indic digits), Arabic Supplement, and the listed sub-ranges of
# Arabic Presentation Forms A and B. Note that whitespace is not kept, so it
# is folded into the surrounding run.
_NON_ARABIC_RUN = re.compile(
    r'[^0-9.'
    r'\u0600-\u06ff'
    r'\u0750-\u077f'
    r'\ufb50-\ufbc1'
    r'\ufbd3-\ufd3f'
    r'\ufd50-\ufd8f'
    r'\ufdf0-\ufdfd'
    r'\ufe70-\ufefc'
    r']+'
)


def Removing_non_arabic(text):
    """Replace each run of non-Arabic characters in ``text`` with one space.

    ASCII digits, Arabic-Indic digits and "." are preserved alongside Arabic
    letters.

    Args:
        text: Input string.

    Returns:
        The string with every maximal run of other characters (including
        whitespace) replaced by a single space.
    """
    return _NON_ARABIC_RUN.sub(' ', text)
# Streamlit re-runs this script on every interaction; only download the
# stop-word corpus when it is not installed yet.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)
# Arabic-specific punctuation plus ASCII punctuation; every character listed
# here is deleted by remove_punctuations.
ara_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation
# NOTE(review): called without a language, so this holds the stop words of
# every language in the corpus. Nothing visible in this file reads it (the
# filter uses arabic_stopwords); kept only so the module-level name survives.
stop_words = stopwords.words()
def remove_punctuations(text):
    """Delete every character listed in ``ara_punctuations`` from ``text``."""
    # A mapping of character -> None tells str.translate to drop it.
    deletion_table = str.maketrans(dict.fromkeys(ara_punctuations))
    return text.translate(deletion_table)
def remove_tashkeel(text):
    """Normalise Arabic letter variants and strip diacritics from ``text``.

    Steps, in order: trim surrounding whitespace; fold alef, alef maqsura,
    hamza-carrier and taa marbuta variants to a single form; delete the
    listed diacritic marks and the tatweel; squeeze any character repeated
    more than twice in a row down to two; finally pass the result through
    pyarabic's ``strip_tashkeel``.
    """
    # (pattern, replacement) pairs applied one after another.
    letter_folds = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
    )
    normalised = text.strip()
    for variants, canonical in letter_folds:
        normalised = re.sub(variants, canonical, normalised)

    diacritics = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    normalised = diacritics.sub('', normalised)

    # A character followed by one or more copies of itself becomes exactly two.
    normalised = re.sub(r'(.)\1+', r"\1\1", normalised)
    return araby.strip_tashkeel(normalised)
arabic_stopwords = stopwords.words("arabic")
# Set copy for O(1) membership tests; the list above keeps its original type
# and contents.
_ARABIC_STOPWORD_SET = frozenset(arabic_stopwords)


def remove_stop_words(text):
    """Drop Arabic stop words from ``text``.

    Args:
        text: Input value; it is converted with ``str()`` and split on
            whitespace.

    Returns:
        The remaining words joined by single spaces.
    """
    kept_words = (word for word in str(text).split() if word not in _ARABIC_STOPWORD_SET)
    return " ".join(kept_words)
def tokenize_text(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)
def Arabic_Light_Stemmer(text):
    """Light-stem every item of ``text`` and join the stems with single spaces.

    ``text`` is iterated item by item; ``preprocess_text`` passes it the
    token list produced by ``tokenize_text``.
    """
    stemmer = ArabicLightStemmer()
    stems = map(stemmer.light_stem, text)
    return " ".join(stems)
def preprocess_text(text):
    """Run the full cleaning pipeline that prepares text for the vectorizer.

    String stages run in a fixed order (digits, non-Arabic characters,
    punctuation, stop words, letter/diacritic normalisation); the cleaned
    string is then tokenized and light-stemmed.

    Returns:
        A single space-joined string of stems.
    """
    string_stages = (
        remove_numbers,
        Removing_non_arabic,
        remove_punctuations,
        remove_stop_words,
        remove_tashkeel,
    )
    cleaned = text
    for stage in string_stages:
        cleaned = stage(cleaned)
    return Arabic_Light_Stemmer(tokenize_text(cleaned))
# Display labels, positioned by the numeric class index they stand for.
_CATEGORY_LABELS = (
    "جنائية",  # criminal
    "احوال شخصية",  # personal status
    "عامة",  # general
)
# Maps a predicted class index to the label shown to the user.
class_mapping = dict(enumerate(_CATEGORY_LABELS))
# Global stylesheet for the app: right-to-left layout, button colours, and
# the helper classes used by the page functions (home-title,
# home-description, larger-text, logo-container).
_PAGE_CSS = """
    <style>
        body {
            background-color: #f0f4f8;
            direction: rtl;
            font-family: 'Arial', sans-serif;
        }
        .logo-container {
            display: flex;
            justify-content: center;
            align-items: center;
            margin-bottom: 20px;
        }
        .stTextArea textarea, .stText {
            text-align: right;
        }
        .stButton>button {
            background-color: #3498db;
            color: white;
            font-family: 'Arial', sans-serif;
        }
        .stButton>button:hover {
            background-color: #2980b9;
        }
        h1, h2, h3, h4, h5, h6, .stSubheader {
            text-align: right;
        }
        .home-title {
            text-align: center;
            font-size: 40px;
            color: #3498db;
        }
        .home-description {
            text-align: center;
            font-size: 20px;
            color: #2c3e50;
        }
        .larger-text {
            font-size: 24px;
            color: #2c3e50;
        }
    </style>
"""
# unsafe_allow_html is required for the <style> tag to be emitted as HTML.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
# Landing page
def home_page():
    """Render the home page: a centred welcome title and a short description."""
    title_html = '<h1 class="home-title">مرحبا بك في تطبيق وجيز</h1>'
    description_html = '<p class="home-description">تطبيق وجيز يقدم لك خدمة التصنيف والملخص للنصوص القانونية. يمكنك إدخال النصوص هنا للحصول على تصنيف دقيق وملخص شامل.</p>'
    for snippet in (title_html, description_html):
        st.markdown(snippet, unsafe_allow_html=True)
def main_page():
    """Render the classify-and-summarize page.

    Shows a text area and a button. When the button is pressed with
    non-blank text, the text is preprocessed, vectorized and classified, the
    original text is summarized, and both results are displayed. Blank input
    produces a warning instead of running the models.
    """
    st.title("صنف ولخص")
    # Input text area
    input_text = st.text_area("ادخل النص", "")
    if not st.button('صنف ولخص'):
        return

    # Whitespace-only input is truthy, so test the stripped text; tell the
    # user why nothing happened rather than staying silent.
    if not input_text.strip():
        st.warning("الرجاء إدخال نص أولاً")
        return

    # Classification works on the cleaned text.
    features = vectorizer.transform([preprocess_text(input_text)])
    predicted_label = model_classify.predict(features)[0]
    category = class_mapping.get(predicted_label, "لم يتم التعرف")

    # Summarization works on the raw text as entered.
    summarized_text = summarize_text(input_text)

    st.markdown('<p class="larger-text">تصنيف القضية :</p>', unsafe_allow_html=True)
    st.write(category)
    st.markdown('<p class="larger-text">ملخص للقضية :</p>', unsafe_allow_html=True)
    st.write(summarized_text)
def app():
    """Draw the sidebar (logo, app name, page selector) and render the chosen page."""
    # Single source of truth for page labels and their render functions, so
    # the selectbox options and the dispatch cannot drift apart.
    pages = {
        "الرئيسية": home_page,
        " صنف ولخص !": main_page,
    }
    # Sidebar navigation with logo inside the sidebar
    with st.sidebar:
        # NOTE(review): each st.markdown call is presumably rendered as its own
        # element, so this opening tag likely does not wrap the image below.
        # Confirm before relying on .logo-container for centring.
        st.markdown('<div class="logo-container">', unsafe_allow_html=True)
        st.image("logo.png", width=200)  # relative path: logo.png must be available to the app
        st.markdown('</div>', unsafe_allow_html=True)
        # Fixed typo in the app name ("تطييق" -> "تطبيق").
        st.header("تطبيق وجيز")
        page_selection = st.selectbox("اختر صفحة", list(pages))
    pages[page_selection]()


if __name__ == "__main__":
    app()

logo.png ADDED Viewed

svm_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56e1780885b58ab910fe9ac58d65ea5f0ddfb81e1527d6e2c0296b39b8a53351
+size 1625610

tfidf_vectorizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a69fa5f5c65c4043d928a2b1350315e12709b89b647340ba86b2c08cacefb0d
+size 231319