Upload 2 files

f1e5b3d verified over 1 year ago

14.7 kB

	import multiprocessing
	import os
	import pandas as pd
	import requests
	from bs4 import BeautifulSoup
	import re
	import string
	import nltk
	import time
	nltk.download('punkt')
	nltk.download('stopwords')
	nltk.download('wordnet')
	nltk.download('cmudict')
	from nltk.corpus import stopwords
	from nltk.tokenize import sent_tokenize, word_tokenize
	from nltk.corpus import cmudict

	# Root of the project tree; every other location below is a sub-folder of it.
	folderpath = r'C:\Users/suwes/SentimentEngine/'
	# Each sub-folder keeps its trailing slash so a file name can be appended directly.
	textfile_path = folderpath + "inputtext/"  # article text, one file per URL_ID
	stopword_path = folderpath + "StopWords/"  # stop-word list files
	masterdict_path = folderpath + "MasterDictionary/"  # positive / negative word lists

	def createdf():
	    """Load ``Input.xlsx`` from ``folderpath`` and return it as a DataFrame.

	    The frame is printed for inspection. Its ``URL`` column is looked up once
	    so that a workbook without that column raises ``KeyError`` right here.
	    """
	    workbook_path = os.path.join(folderpath, "Input.xlsx")
	    frame = pd.read_excel(workbook_path)
	    print(frame)
	    _ = frame['URL']  # kept only for the KeyError on a missing column
	    return frame

	# Module-level load: the input sheet is read as soon as this file is run or imported.
	df = createdf()

	def extract(df):
	    """Download every article listed in *df* and save its text to disk.

	    For each row the page at ``row['URL']`` is fetched; its ``<title>`` text and
	    the text of the paragraph/heading tags inside the
	    ``td-pb-span8 td-main-content`` container are written to
	    ``<textfile_path><URL_ID>.txt`` (title, blank line, body).

	    A URL that cannot be fetched is reported and skipped so that one bad link
	    does not abort the whole run.

	    Args:
	        df: DataFrame with ``URL`` and ``URL_ID`` columns.
	    """
	    text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']

	    def extract_urltext(url):
	        """Return ``(title, body_text)`` for the article at *url*."""
	        # A timeout keeps a stalled server from hanging the run indefinitely.
	        response = requests.get(url, timeout=30)
	        soup = BeautifulSoup(response.content, 'html.parser')
	        title_tag = soup.find('title')
	        # A page without <title> yields an empty title instead of AttributeError.
	        article_title = title_tag.get_text() if title_tag is not None else ''
	        article_content = soup.find('div', class_='td-pb-span8 td-main-content')
	        if article_content is None:
	            return article_title, ''
	        article_text = ''.join(
	            para.get_text() for para in article_content.find_all(text_tags)
	        )
	        return article_title, article_text

	    for _, row in df.iterrows():
	        url = row['URL']
	        url_id = row['URL_ID']
	        try:
	            article_title, article_text = extract_urltext(url)
	        except requests.RequestException as err:
	            print(f"could not fetch {url} ({url_id}): {err}")
	            continue
	        # Same folder the cleaning and scoring steps read from.
	        filename = f"{textfile_path}{url_id}.txt"
	        with open(filename, 'w', encoding='utf-8') as file:
	            file.write(article_title + '\n\n' + article_text)
	        print(f"text saved to file {filename}")

	#extract data
	# NOTE(review): this runs at module level, outside the __main__ guard. If the
	# worker processes are started with the "spawn" method they re-import this file
	# and would repeat the download — confirm, and consider moving it under the guard.
	extract(df)

	def transform(df):
	#cleaning stop words
	#reading stop words from stopword files
	def read_stopwords(stopword_folder):
	    """Collect the stop words stored in every file of *stopword_folder*.

	    Each non-blank line of each file contributes one entry. Entries are
	    lower-cased because every lookup against this set uses ``word.lower()``,
	    so an entry stored in upper case could never match.

	    Args:
	        stopword_folder: Directory containing the stop-word text files.

	    Returns:
	        set[str]: The lower-cased stop words.
	    """
	    stopwords = set()
	    for filename in os.listdir(stopword_folder):
	        filepath = os.path.join(stopword_folder, filename)
	        if not os.path.isfile(filepath):
	            continue  # sub-directories cannot be opened as word lists
	        # Undecodable bytes are dropped rather than aborting the load.
	        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
	            for line in file:
	                entry = line.strip().lower()
	                if entry:
	                    stopwords.add(entry)
	    return stopwords
	#stop words
	# Loaded once; the same set is passed to the file-cleaning step and to the
	# positive/negative dictionary builder further down.
	stopwords = read_stopwords(stopword_path)

	#strip stop words out of one piece of text
	def clean_stopwords(text, stopwords):
	    """Return *text* without the tokens whose lower-case form is in *stopwords*.

	    The text is split with ``word_tokenize`` and the surviving tokens are
	    re-joined with single spaces, so the original spacing is not preserved.
	    """
	    kept = []
	    for token in word_tokenize(text):
	        if token.lower() in stopwords:
	            continue
	        kept.append(token)
	    return ' '.join(kept)

	#strip stop words from every file in a folder
	def clean_stopwords_directory(directory, stopwords):
	    """Remove stop words from each file in *directory*, overwriting it in place.

	    Files are read and written as UTF-8 with undecodable/unencodable
	    characters ignored; one progress line is printed per file.
	    """
	    for filename in os.listdir(directory):
	        target = os.path.join(directory, filename)
	        with open(target, 'r', encoding='utf-8', errors='ignore') as source:
	            original_text = source.read()
	        stripped_text = clean_stopwords(original_text, stopwords)
	        with open(target, 'w', encoding='utf-8', errors='ignore') as sink:
	            sink.write(stripped_text)
	        print(f"cleaned text from {filename}")

	# NOTE: overwrites the files under textfile_path in place, so the scoring step
	# later reads text that already has its stop words removed.
	clean_stopwords_directory(textfile_path, stopwords)
	#build the positive and negative word sets
	def create_posneg_dict(masterdict_path, stopwords):
	    """Read the master dictionary files and return ``(poswords, negwords)``.

	    ``positive-words.txt`` and ``negative-words.txt`` are read from
	    *masterdict_path*. Every whitespace-separated token is lower-cased and
	    kept unless that lower-cased form is in *stopwords*.
	    """
	    def load_wordset(listname):
	        # One pass over the file: lower-case each token, drop the stop words.
	        listpath = os.path.join(masterdict_path, listname)
	        with open(listpath, 'r', encoding='utf-8', errors='ignore') as file:
	            lowered = (token.lower() for line in file for token in line.strip().split())
	            return {token for token in lowered if token not in stopwords}

	    poswords = load_wordset('positive-words.txt')
	    negwords = load_wordset('negative-words.txt')
	    return poswords, negwords

	# Sentiment word sets, already filtered against the stop words.
	positivewords, negativewords = create_posneg_dict(masterdict_path, stopwords)
	#print(positivewords)
	#print(negativewords)
	# Hand all three word sets back to the caller.
	return stopwords, positivewords, negativewords

	#cleaning/transforming data
	# NOTE: this rebinds the module-level name `stopwords`, replacing the
	# nltk.corpus `stopwords` object imported at the top of the file.
	stopwords, positivewords, negativewords = transform(df)

	#load data
	# Empty placeholder; the __main__ section rebinds result_df to the combined worker output.
	result_df = pd.DataFrame()
	def loadoutput(folderpath, result_df):
	    """Write *result_df* to ``Output.xlsx`` under *folderpath* and report the run time.

	    Args:
	        folderpath: Destination folder; it must end with a path separator
	            because the file name is appended to it directly.
	        result_df: DataFrame holding the per-article results.

	    The elapsed time is measured from the module-level ``starttime``. When that
	    name has not been set (the function is called outside the ``__main__``
	    section) the timing line is skipped instead of raising ``NameError``.
	    """
	    exceloutfilepath = f"{folderpath}Output.xlsx"
	    result_df.to_excel(exceloutfilepath, index=False)
	    print(f"output file saved to {exceloutfilepath}")
	    started = globals().get('starttime')
	    if started is None:
	        return
	    # Sample the clock once so hours, minutes and seconds describe the same instant.
	    elapsed = int(time.time() - started)
	    minutes, seconds = divmod(elapsed, 60)
	    hours, minutes = divmod(minutes, 60)
	    print(f"analysis time: {hours} hours {minutes} minutes {seconds} seconds")

	#process text files
	def runengine(df, stopwords, files_subset, dflist):
	#sentimental analysis
	#calculating variables
	def calculate_positivescore(words, positivewords):
	    """Count the tokens in *words* whose lower-case form is in *positivewords*."""
	    hits = [token for token in words if token.lower() in positivewords]
	    return len(hits)

	def calculate_negativescore(words, negativewords):
	    """Count the tokens in *words* whose lower-case form is in *negativewords*.

	    The result is a non-negative count.
	    """
	    tally = 0
	    for token in words:
	        if token.lower() in negativewords:
	            tally += 1
	    return tally

	#readability metrics
	def calc_readibility(words, sentences):
	    """Return ``(avg_sentencelen, percent_complexwords, fog_index)`` for a text.

	    * average sentence length = number of words / number of sentences
	    * percentage of complex words = share of words whose ``syllable_count``
	      exceeds 2, multiplied by 100
	    * fog index = 0.4 * (average sentence length + percentage of complex words)

	    A ratio is 0 when the sequence it divides by is empty.
	    """
	    if sentences:
	        avg_sentencelen = len(words) / len(sentences)
	    else:
	        avg_sentencelen = 0
	    complex_total = sum(1 for token in words if syllable_count(token) > 2)
	    if words:
	        percent_complexwords = complex_total / len(words) * 100
	    else:
	        percent_complexwords = 0
	    fog_index = 0.4 * (avg_sentencelen + percent_complexwords)
	    return avg_sentencelen, percent_complexwords, fog_index

	#mean number of words per sentence
	def avg_wordspersentence(words, sentences):
	    """Return ``len(words) / len(sentences)``, or 0 when there are no sentences."""
	    sentence_total = len(sentences)
	    return len(words) / sentence_total if sentence_total > 0 else 0

	#syllable lookup used for the complex word count
	def syllable_count(word):
	    """Return the syllable count of *word* according to the CMU dictionary.

	    One syllable is counted for every phoneme whose last character is a digit
	    in the word's first listed pronunciation. Words that are not in the
	    dictionary count as 0 syllables.

	    The mapping returned by ``cmudict.dict()`` is built on the first call and
	    kept on the function object, so the load is not repeated for every token.
	    """
	    pronunciations = getattr(syllable_count, '_pronunciations', None)
	    if pronunciations is None:
	        pronunciations = cmudict.dict()
	        syllable_count._pronunciations = pronunciations
	    variants = pronunciations.get(word.lower())
	    if not variants:
	        return 0
	    return sum(1 for phoneme in variants[0] if phoneme[-1].isdigit())
	def complexwords_count(words):
	    """Return how many entries of *words* have a ``syllable_count`` greater than 2."""
	    return sum(1 for token in words if syllable_count(token) > 2)

	#count of words left after cleaning
	def cleanwords_count(words, stopwords):
	    """Count the tokens that are neither stop words nor single punctuation characters.

	    Both checks are made on the lower-cased token.
	    """
	    punctuation_marks = frozenset(string.punctuation)
	    total = 0
	    for token in words:
	        lowered = token.lower()
	        if lowered in stopwords or lowered in punctuation_marks:
	            continue
	        total += 1
	    return total

	#syllable count per word
	#vowel-group based syllable estimate for a single word
	def vowel_syllable(word):
	    """Estimate the syllables in *word* by counting its vowel groups.

	    The word is lower-cased and stripped, one trailing suffix is removed
	    (``ing``, ``le``, ``es``, ``ed`` or a lone ``e``), and every run of
	    consecutive vowels (``a e i o u y``) in what remains counts as one
	    syllable. A remainder of three letters or fewer counts as one syllable;
	    an empty word counts as zero.

	    Args:
	        word: The token to measure.

	    Returns:
	        int: The estimated syllable count.
	    """
	    vowels = 'aeiouy'
	    word = word.lower().strip()
	    if not word:
	        return 0
	    # Most specific suffixes first: testing the lone 'e' earlier would also
	    # swallow every word ending in 'le'.
	    for suffix in ('ing', 'le', 'es', 'ed', 'e'):
	        if word.endswith(suffix):
	            # Drop exactly the matched suffix (one character for a lone 'e').
	            word = word[:-len(suffix)]
	            break
	    # NOTE(review): 'ing' and 'le' are removed without being counted, which
	    # presumably under-counts words such as "walking" — confirm the intended rule.
	    if len(word) <= 3:
	        return 1
	    count = 0
	    for index, letter in enumerate(word):
	        # A vowel opens a new syllable only when it does not follow another vowel.
	        if letter in vowels and (index == 0 or word[index - 1] not in vowels):
	            count += 1
	    # 'y' is already in `vowels`, so a trailing 'y' needs no separate increment.
	    return count
	#syllable total for a whole text
	def vowel_syllable_perword(words):
	    """Return the sum of ``syllable_count(word)`` over every entry of *words*.

	    NOTE(review): despite the name this is a total, not a per-word value, and
	    it uses ``syllable_count`` rather than ``vowel_syllable`` — confirm that
	    this is intended.
	    """
	    running_total = 0
	    for word in words:
	        running_total += syllable_count(word)
	    return running_total

	#personal pronouns
	def count_pronouns(text):
	    """Count the personal pronouns I, we, my, ours and us in *text*.

	    Matching is case-insensitive and restricted to whole words. The
	    all-capitals token ``US`` is treated as the country abbreviation and is
	    not counted; ``us`` and ``Us`` are.

	    Args:
	        text: The text to scan.

	    Returns:
	        int: Number of pronoun occurrences.
	    """
	    # Plain `|` alternation: an escaped `\|` only matches a literal pipe character.
	    pattern = r'\b(?:I|we|my|ours|us)\b'
	    matches = re.findall(pattern, text, flags=re.IGNORECASE)
	    return sum(1 for match in matches if match != 'US')

	#mean word length
	def calc_avg_wordlength(words):
	    """Return the mean number of characters per entry of *words* (0 when empty)."""
	    if len(words) == 0:
	        return 0
	    return sum(map(len, words)) / len(words)

	def appendtodf(url_idkey, calc_values, process_df):
	    """Return *process_df* with one more row holding the results for *url_idkey*.

	    The row's ``URL_ID`` and ``URL`` are copied from the matching row of the
	    enclosing ``df``; the remaining cells come from *calc_values*.

	    Args:
	        url_idkey: ``URL_ID`` value to look up in ``df``.
	        calc_values: Mapping of output column name to computed value.
	        process_df: DataFrame accumulating the rows produced so far.

	    Returns:
	        The extended DataFrame, or *process_df* unchanged when ``df`` has no
	        row with that ``URL_ID``.
	    """
	    rowindex = df[df['URL_ID'] == url_idkey].index
	    if rowindex.empty:
	        print(f"!not found {url_idkey}")
	        return process_df
	    source_row = df.loc[rowindex[0]]
	    row = {'URL_ID': source_row['URL_ID'], 'URL': source_row['URL'], **calc_values}
	    # pd.concat is the public replacement for DataFrame.append, which pandas 2.0
	    # removed; the private `_append` may change without notice.
	    process_df = pd.concat([process_df, pd.DataFrame([row])], ignore_index=True)
	    print(f"Result updated for {url_idkey}")
	    return process_df

	#process data/ processing each file
	# Accumulates one result row per processed file for this worker.
	process_df = pd.DataFrame(columns=df.columns)
	for filename in files_subset:
	    filepath = os.path.join(textfile_path, filename)
	    # The URL_ID is taken from the file name. Entries that carry no ID, or that
	    # are not regular files, are skipped: building a row for them would either
	    # fail on undefined metrics or reuse the previous file's values.
	    id_match = re.search(r'blackassign\d{4}', filename)
	    if id_match is None or not os.path.isfile(filepath):
	        print(f"df not updated for {filename}!")
	        continue
	    url_idkey = id_match.group()

	    with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
	        text = file.read()
	    #tokenize text
	    words = word_tokenize(text)
	    sentences = sent_tokenize(text)
	    totalwords = len(words)

	    #calculate positive score
	    positive_score = calculate_positivescore(words, positivewords)
	    print(f"{filename} positive socre: {positive_score}")

	    #calculate negative score
	    negative_score = calculate_negativescore(words, negativewords)
	    print(f"{filename} negative socre: {negative_score}")

	    #calculate polarity score; the small constant avoids division by zero
	    polarity_score = (positive_score - negative_score)/((positive_score+negative_score)+0.000001)
	    print(f"{filename} polarity socre: {polarity_score}")

	    #calculate subjective score; the small constant avoids division by zero
	    subjectivity_score = (positive_score+negative_score)/((totalwords)+0.000001)
	    print(f"{filename} subjectivity socre: {subjectivity_score}")

	    #readibility analysis
	    avg_sentencelen, percent_complexwords, fog_index = calc_readibility(words, sentences)
	    print(f"{filename} avg sentencelength: {avg_sentencelen}")
	    print(f"{filename} percentage of complex words: {percent_complexwords}")
	    print(f"{filename} Fog Index: {fog_index}")

	    #average number of words per sentence
	    avg_wordper_sentence = avg_wordspersentence(words, sentences)
	    print(f"{filename} avg words per sentence: {avg_wordper_sentence}")

	    #complex word count
	    complexword_count = complexwords_count(words)
	    print(f"{filename} complex words count: {complexword_count}")

	    #word count
	    cleanword_count = cleanwords_count(words, stopwords)
	    print(f"{filename} clean words count: {cleanword_count}")

	    #syllable count per word
	    syllablecount_perword = vowel_syllable_perword(words)
	    print(f"{filename} syllable count per word: {syllablecount_perword}")

	    #personal pronouns
	    pronouns_count = count_pronouns(text)
	    print(f"{filename} personal pronouns count: {pronouns_count}")

	    #avg word length
	    avg_wordlength = calc_avg_wordlength(words)
	    print(f"{filename} avg word length: {avg_wordlength}")

	    # Keys are the output column names written to the result sheet.
	    calc_values = {
	        "POSITIVE SCORE": positive_score,
	        "NEGATIVE SCORE": negative_score,
	        "POLARITY SCORE": polarity_score,
	        "SUBJECTIVITY SCORE": subjectivity_score,
	        "AVG SENTENCE LENGTH": avg_sentencelen,
	        "PERCENTAGE OF COMPLEX WORDS": percent_complexwords,
	        "FOG INDEX": fog_index,
	        "AVG NUMBER OF WORDS PER SENTENCE": avg_wordper_sentence,
	        "COMPLEX WORD COUNT": complexword_count,
	        "WORD COUNT": cleanword_count,
	        "SYLLABLE PER WORD": syllablecount_perword,
	        "PERSONAL PRONOUNS": pronouns_count,
	        "AVG WORD LENGTH": avg_wordlength,
	    }
	    # Best effort: a failure while storing one file must not lose the rows
	    # already collected for the others.
	    try:
	        process_df = appendtodf(url_idkey, calc_values, process_df)
	    except Exception as e:
	        print(e)
	print(process_df)
	# Publish this worker's rows through the shared list.
	dflist.append(process_df)



	# Entry point: deal the text files out to worker processes, then merge and save.
	# NOTE(review): statements that sit at module level above this guard are executed
	# again by every worker when the "spawn" start method is used — confirm.
	if __name__ == '__main__':
	    starttime = time.time()
	    files_toprocess = os.listdir(textfile_path)
	    cpu_total = multiprocessing.cpu_count()
	    print(str(cpu_total)+ " CPUs")
	    # Never start more workers than there are files. With integer division a
	    # short file list gave every worker but the last an empty share.
	    num_processes = max(1, min(cpu_total, len(files_toprocess)))
	    files_perprocess = len(files_toprocess) // num_processes
	    print(files_perprocess)

	    processes = []
	    # A Manager list lets every worker hand its result frame back to this process.
	    manager = multiprocessing.Manager()
	    dflist = manager.list()

	    for i in range(num_processes):
	        # Striding spreads the files evenly; worker sizes differ by at most one.
	        files_subset = files_toprocess[i::num_processes]
	        try:
	            p = multiprocessing.Process(target=runengine, args=(df, stopwords, files_subset, dflist))
	            p.start()
	        except Exception as e:
	            print(e)
	        else:
	            processes.append(p)

	    print("waiting for all processes to end...")
	    for i in processes:
	        print(i)
	    for process in processes:
	        process.join()
	        # A crashed worker contributes no rows; say so instead of failing silently.
	        if process.exitcode != 0:
	            print(f"{process.name} exited with code {process.exitcode}")
	    for i in processes:
	        print(i)

	    # Copy the proxy into a plain list before handing it to pandas.
	    frames = list(dflist)
	    print(str(len(frames))+" result dataframes obtained.")
	    if frames:
	        result_df = pd.concat(frames, ignore_index=True)
	        result_df = result_df.sort_values(by='URL_ID')
	    else:
	        # pd.concat raises on an empty list; fall back to an empty sheet.
	        print("no results were produced; writing an empty output file")
	        result_df = pd.DataFrame(columns=df.columns)
	    print(result_df)

	    loadoutput(folderpath, result_df)