# -*- coding: utf-8 -*-
"""model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1lKXL4Cdum5DiSbczUsadXc0F8j46NM_m

# in the name of **allah**
"""
import os
import re

import pandas as pd
import torch
from datasets import Dataset
from hazm import Normalizer, Lemmatizer, word_tokenize, stopwords_list
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Initialize Hazm components
normalizer = Normalizer()
lemmatizer = Lemmatizer()
stopwords = stopwords_list()
# Load the BERT model and tokenizer for Persian sentiment analysis
model_name = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-multi"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Select device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Tokenization function for sentiment analysis (return_tensors is omitted
# because Dataset.map stores plain lists; tensors are rebuilt in predict_sentiment)
def tokenize_function(examples):
    return tokenizer(examples["Comment"], padding="max_length", truncation=True, max_length=128)
# Sentiment prediction function: run the model on a tokenized batch
# and return the predicted class ids
def predict_sentiment(batch):
    input_ids = torch.tensor(batch['input_ids']).to(device)
    attention_mask = torch.tensor(batch['attention_mask']).to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return {'sentiment': predictions.cpu()}
# Mapping class ids to Persian sentiment labels
sentiment_labels = {
    0: 'بسیار عصبانی',  # very angry
    1: 'عصبانی',        # angry
    2: 'خنثی',          # neutral
    3: 'مثبت',          # positive
    4: 'بسیار مثبت'     # very positive
}
# Predict a sentiment label for a single text
def predict_sentiment_labels(text):
    dataset = Dataset.from_dict({"Comment": [text]})
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    predicted_sentiments = tokenized_dataset.map(predict_sentiment, batched=True)
    sentiment = predicted_sentiments[0]['sentiment']
    return sentiment_labels.get(sentiment, 'نامشخص')
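# Example usage (the sample text is invented for illustration, and the exact
# label depends on the fine-tuned checkpoint, so the expected output below
# is an assumption, not a guarantee):
print(predict_sentiment_labels('این فیلم عالی بود'))  # e.g. 'بسیار مثبت'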
# Rule-based sentence-type classification and text cleaning
imperative_verbs = [
    'بیا', 'برو', 'بخواب', 'کن', 'باش', 'بذار', 'فراموش کن', 'بخور',
    'بپوش', 'ببخش', 'بنویس', 'دقت کن', 'دست بردار', 'سکوت کن',
    'اجازه بده', 'نکن', 'پیش برو', 'خواب بمان', 'توجه کن', 'خوش آمدید',
    'حواسجمع باش', 'در نظر بگیر', 'بخشید', 'بکش', 'نگذار', 'سعی کن',
    'تلاش کن', 'ببین', 'نرو', 'بگیر', 'بگو', 'شک نکن', 'فکر کن',
    'عادت کن', 'بیانداز', 'حرکت کن', 'شکایت نکن', 'عاشق شو', 'بخند',
    'برگرد', 'بزن', 'آشپزی کن', 'بپذیر', 'شیرینی بپز', 'درس بخوان',
    'کلاس بگذار', 'کمک کن', 'بمان', 'راهنمایی کن', 'لطفا'
]
# Classify a sentence as interrogative, imperative, or declarative
def classify_sentence(sentence):
    sentence = sentence.strip()
    sentence_type = 'خبری'  # declarative by default
    if re.search(r'چرا|چطور|کجا|آیا|چه|چی|چند|کدام|کی|چندم|چیست|چیه|چندمین|چجوری|چگونه|؟', sentence) or sentence.endswith('?'):
        sentence_type = 'پرسشی'  # interrogative
    elif re.search(r'\b(?:' + '|'.join(imperative_verbs) + r')\b', sentence):
        sentence_type = 'امری'  # imperative
    return sentence_type
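# Quick sanity checks on classify_sentence (sample sentences invented for
# illustration; the expected labels follow from the rules above):
print(classify_sentence('چرا دیر آمدی؟'))      # 'پرسشی' (interrogative)
print(classify_sentence('لطفا کمک کن'))        # 'امری' (imperative)
print(classify_sentence('هوا امروز خوب است'))  # 'خبری' (declarative)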
# Remove URLs and non-Persian characters, then re-tokenize the text
def clean_text(text):
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^ا-ی0-9\s#@_؟]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = word_tokenize(text)
    # Optional steps, currently disabled:
    # words = [word for word in words if word not in stopwords]
    # words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)
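# Example: URLs and Latin punctuation are stripped by the regexes above
# (sample sentence invented for illustration):
print(clean_text('سلام! این لینک را ببین: https://example.com'))
# -> 'سلام این لینک را ببین'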
# Full pipeline for one sentence: clean, classify type, predict sentiment
def process_sentence(sentence):
    cleaned = clean_text(sentence)
    sentence_type = classify_sentence(cleaned)
    sentiment = predict_sentiment_labels(sentence)
    return f"Type: {sentence_type}\nSentiment: {sentiment}\nCleaned Text: {cleaned}"
# Create the output folder if it does not already exist
output_folder = "./outputs"
os.makedirs(output_folder, exist_ok=True)
# Process an uploaded CSV file and write the annotated copy to disk
def process_file(file):
    try:
        df = pd.read_csv(file.name)
        if 'Comment' not in df.columns:
            return "Error: No 'Comment' column found in the file."
        # Annotate each comment with cleaned text, sentence type, and sentiment
        df['Cleaned_Comment'] = df['Comment'].apply(clean_text)
        df['Type'] = df['Comment'].apply(classify_sentence)
        df['Sentiment'] = df['Comment'].apply(predict_sentiment_labels)
        processed_file_path = os.path.join(output_folder, "processed_file.csv")
        df.to_csv(processed_file_path, index=False, encoding='utf-8-sig')
        return processed_file_path
    except Exception as e:
        return str(e)
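# A minimal Gradio wiring sketch (an assumption: the Space's actual interface
# definition is not shown in this notebook, but process_file reads file.name,
# which matches the object gr.File passes to the function):
import gradio as gr

demo = gr.Interface(
    fn=process_file,
    inputs=gr.File(label="CSV with a 'Comment' column"),
    outputs=gr.File(label="Processed CSV"),
)

if __name__ == "__main__":
    demo.launch()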