# -*- coding: utf-8 -*-
"""model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1lKXL4Cdum5DiSbczUsadXc0F8j46NM_m

# in the name of **allah**
"""
import os
import re

import pandas as pd
import torch
from datasets import Dataset
from hazm import Normalizer, Lemmatizer, word_tokenize, stopwords_list
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Initialize Hazm components
normalizer = Normalizer()
lemmatizer = Lemmatizer()
stopwords = stopwords_list()
# Load the BERT model and tokenizer for Persian sentiment analysis
model_name = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-multi"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Select device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Tokenization function for sentiment analysis (return_tensors is omitted
# because Dataset.map stores plain lists; tensors are rebuilt in predict_sentiment)
def tokenize_function(examples):
    return tokenizer(examples["Comment"], padding="max_length", truncation=True, max_length=128)
# Sentiment prediction function: run the model on a tokenized batch
# and return the predicted class ids
def predict_sentiment(batch):
    input_ids = torch.tensor(batch['input_ids']).to(device)
    attention_mask = torch.tensor(batch['attention_mask']).to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return {'sentiment': predictions.cpu()}
# Mapping class ids to Persian sentiment labels
sentiment_labels = {
    0: 'بسیار عصبانی',  # very angry
    1: 'عصبانی',        # angry
    2: 'خنثی',          # neutral
    3: 'مثبت',          # positive
    4: 'بسیار مثبت'     # very positive
}
# Predict a sentiment label for a single text
def predict_sentiment_labels(text):
    dataset = Dataset.from_dict({"Comment": [text]})
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    predicted_sentiments = tokenized_dataset.map(predict_sentiment, batched=True)
    sentiment = predicted_sentiments[0]['sentiment']
    return sentiment_labels.get(sentiment, 'نامشخص')
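# Example usage (the sample text is invented for illustration, and the exact
# label depends on the fine-tuned checkpoint, so the expected output below
# is an assumption, not a guarantee):
print(predict_sentiment_labels('این فیلم عالی بود'))  # e.g. 'بسیار مثبت'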
# Rule-based sentence-type classification and text cleaning
imperative_verbs = [
    'بیا', 'برو', 'بخواب', 'کن', 'باش', 'بذار', 'فراموش کن', 'بخور',
    'بپوش', 'ببخش', 'بنویس', 'دقت کن', 'دست بردار', 'سکوت کن',
    'اجازه بده', 'نکن', 'پیش برو', 'خواب بمان', 'توجه کن', 'خوش آمدید',
    'حواسجمع باش', 'در نظر بگیر', 'بخشید', 'بکش', 'نگذار', 'سعی کن',
    'تلاش کن', 'ببین', 'نرو', 'بگیر', 'بگو', 'شک نکن', 'فکر کن',
    'عادت کن', 'بیانداز', 'حرکت کن', 'شکایت نکن', 'عاشق شو', 'بخند',
    'برگرد', 'بزن', 'آشپزی کن', 'بپذیر', 'شیرینی بپز', 'درس بخوان',
    'کلاس بگذار', 'کمک کن', 'بمان', 'راهنمایی کن', 'لطفا'
]
# Classify a sentence as interrogative, imperative, or declarative
def classify_sentence(sentence):
    sentence = sentence.strip()
    sentence_type = 'خبری'  # declarative by default
    if re.search(r'چرا|چطور|کجا|آیا|چه|چی|چند|کدام|کی|چندم|چیست|چیه|چندمین|چجوری|چگونه|؟', sentence) or sentence.endswith('?'):
        sentence_type = 'پرسشی'  # interrogative
    elif re.search(r'\b(?:' + '|'.join(imperative_verbs) + r')\b', sentence):
        sentence_type = 'امری'  # imperative
    return sentence_type
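# Quick sanity checks on classify_sentence (sample sentences invented for
# illustration; the expected labels follow from the rules above):
print(classify_sentence('چرا دیر آمدی؟'))      # 'پرسشی' (interrogative)
print(classify_sentence('لطفا کمک کن'))        # 'امری' (imperative)
print(classify_sentence('هوا امروز خوب است'))  # 'خبری' (declarative)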
# Remove URLs and non-Persian characters, then re-tokenize the text
def clean_text(text):
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^ا-ی0-9\s#@_؟]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = word_tokenize(text)
    # Optional steps, currently disabled:
    # words = [word for word in words if word not in stopwords]
    # words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)
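# Example: URLs and Latin punctuation are stripped by the regexes above
# (sample sentence invented for illustration):
print(clean_text('سلام! این لینک را ببین: https://example.com'))
# -> 'سلام این لینک را ببین'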
# Full pipeline for one sentence: clean, classify type, predict sentiment
def process_sentence(sentence):
    cleaned = clean_text(sentence)
    sentence_type = classify_sentence(cleaned)
    sentiment = predict_sentiment_labels(sentence)
    return f"Type: {sentence_type}\nSentiment: {sentiment}\nCleaned Text: {cleaned}"
# Create the output folder if it does not already exist
output_folder = "./outputs"
os.makedirs(output_folder, exist_ok=True)
# Process an uploaded CSV file and write the annotated copy to disk
def process_file(file):
    try:
        df = pd.read_csv(file.name)
        if 'Comment' not in df.columns:
            return "Error: No 'Comment' column found in the file."
        # Annotate each comment with cleaned text, sentence type, and sentiment
        df['Cleaned_Comment'] = df['Comment'].apply(clean_text)
        df['Type'] = df['Comment'].apply(classify_sentence)
        df['Sentiment'] = df['Comment'].apply(predict_sentiment_labels)
        processed_file_path = os.path.join(output_folder, "processed_file.csv")
        df.to_csv(processed_file_path, index=False, encoding='utf-8-sig')
        return processed_file_path
    except Exception as e:
        return str(e)
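# A minimal Gradio wiring sketch (an assumption: the Space's actual interface
# definition is not shown in this notebook, but process_file reads file.name,
# which matches the object gr.File passes to the function):
import gradio as gr

demo = gr.Interface(
    fn=process_file,
    inputs=gr.File(label="CSV with a 'Comment' column"),
    outputs=gr.File(label="Processed CSV"),
)

if __name__ == "__main__":
    demo.launch()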