# eksiSUMM / functions.py
# Author: Onat Kaya
# Last commit: 91f3eff — fixed minor syntax issue regarding api_key
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
from openai import OpenAI
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from collections import Counter
from peft import PeftModel, PeftConfig
import matplotlib.pyplot as plt
import io
from PIL import Image
import os
api_key = os.environ.get("OPENAI_KEY")
def check_multi_page(url_main):
    """
    Check whether the page of the title is multi-paged (contains a page
    counter) or single-paged (does not contain a page counter).

    Parameters:
        url_main (str): URL of the EksiSozluk title page.

    Returns:
        tuple[bool, int]: (True, total_pages) when the page has a pager,
        (False, 1) when it is single-paged.
    """
    headers1 = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    req = Request(url_main, headers=headers1)
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    pager = soup.find('div', class_="pager")
    try:
        # The pager div carries the total page count in its
        # "data-pagecount" attribute; read it via BeautifulSoup's Tag
        # item access instead of string-slicing the raw HTML.
        pagecount_int = int(pager['data-pagecount'])
        print(f"This title contains {pagecount_int} pages.")
        return True, pagecount_int  # it is multi-paged.
    except (TypeError, KeyError, ValueError):
        # pager is None (no pager div), the attribute is missing, or the
        # attribute value is not an integer — treat all as single-paged.
        print("This title only contains 1 page.")
        return False, 1  # it is single-paged.
def single_page_scrape(url):
    """
    Scrape all the entries from a single URL page.

    Parameters:
        url (str): URL of the page to scrape.

    Returns:
        list[str]: one cleaned string per post on the page.
    """
    ua_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    request = Request(url, headers=ua_headers)
    html = urlopen(request).read()
    parsed = BeautifulSoup(html, 'html.parser')
    # Each post lives in a <div class="content">; keep only the text and
    # drop the surrounding whitespace.
    return [node.text.strip() for node in parsed.find_all('div', class_='content')]
def get_page_title(url):
    """
    Return the title of an EksiSozluk page.

    In EksiSozluk, every page has a title; it is published inside a
    <span itemprop="name"> element.

    Parameters:
        url (str): URL of the page.

    Returns:
        str: the page title.
    """
    ua_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    request = Request(url, headers=ua_headers)
    html = urlopen(request).read()
    parsed = BeautifulSoup(html, 'html.parser')
    title_node = parsed.find('span', itemprop='name')
    return str(title_node.text)
def all_pages_scrape(url_main):
    """
    Scrape all the entries from all pages of a title.

    Parameters:
        url_main (str): base URL of the EksiSozluk title.

    Returns:
        list[str]: one string per post, collected across every page.
    """
    multi_page_bool, num_pages = check_multi_page(url_main)
    all_entries = []
    if multi_page_bool:
        # Pages are 1-indexed and selected with the "?p=" query parameter.
        for page in range(1, num_pages + 1):
            print(f"Scraping page {page}...")
            all_entries.extend(single_page_scrape(f"{url_main}?p={page}"))
    else:
        all_entries.extend(single_page_scrape(url_main))
    print("Scraping EksiSozluk entries is completed!")
    return all_entries
# Using OpenAI API, for summarization
def get_completion(prompt, tokens_create, model="gpt-4o-mini"):
    """
    Request a completion from the OpenAI Responses API.

    Parameters:
        prompt (str): the user prompt to send.
        tokens_create (int): cap on the number of output tokens.
        model (str): OpenAI model name to query.

    Returns:
        str: the model's text output.
    """
    client = OpenAI(api_key=api_key)
    response = client.responses.create(
        model=model,
        input=[{"role": "user", "content": prompt}],
        temperature=0.1,  # low randomness -> near-deterministic summaries
        max_output_tokens=tokens_create,
    )
    return response.output_text
def create_pie_chart(positives, neutrals, negatives):
    """
    Render the sentiment counts as a pie chart.

    Parameters:
        positives (int): number of positive posts.
        neutrals (int): number of neutral posts.
        negatives (int): number of negative posts.

    Returns:
        PIL.Image.Image: the rendered chart as an in-memory PNG image.
    """
    labels = ['positive', 'neutral', 'negative']
    sizes = [positives, neutrals, negatives]
    colors = ['lightgreen', 'skyblue', 'salmon']
    explode = (0.1, 0.1, 0.1)  # pull every wedge slightly outward for emphasis
    fig, ax = plt.subplots(figsize=(6, 6))
    # Suppress the percentage label on zero-sized wedges so empty
    # sentiment categories do not clutter the chart.
    ax.pie(sizes, colors=colors, explode=explode, startangle=140,
           autopct=lambda p: f'{p:.1f}%' if p > 0 else '')
    ax.legend(labels, loc="best")
    ax.set_title('Sentiment Analysis Results')
    ax.axis('equal')
    # Put the figure through a buffer, and then convert it to an Image.
    im_buf = io.BytesIO()
    # Save this figure explicitly rather than via plt.savefig, which acts
    # on the implicit "current" figure and could pick up another one.
    fig.savefig(im_buf, format='png')
    plt.close(fig)
    im_buf.seek(0)
    image = Image.open(im_buf)
    return image
# https://huggingface.co/VRLLab/TurkishBERTweet
def sentiment_analysis(entries_list):
    """
    Classify each post as positive/neutral/negative with TurkishBERTweet.

    Parameters:
        entries_list (list[str]): posts to classify.

    Returns:
        tuple[str, PIL.Image.Image]: a human-readable tally of the labels
        and a pie chart of the label distribution.
    """
    print("Conducting Sentiment Analysis on Posts...")
    peft_model = "VRLLab/TurkishBERTweet-Lora-SA"
    peft_config = PeftConfig.from_pretrained(peft_model)
    # Load the tokenizer of the base model that the LoRA adapter was
    # trained on, padding on the right.
    tokenizer = AutoTokenizer.from_pretrained(
        peft_config.base_model_name_or_path, padding_side="right")
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    id2label_sa = {0: "negative", 2: "positive", 1: "neutral"}
    base_model = AutoModelForSequenceClassification.from_pretrained(
        peft_config.base_model_name_or_path,
        return_dict=True,
        num_labels=len(id2label_sa),
        id2label=id2label_sa,
    )
    # Attach the sentiment-analysis LoRA adapter on top of the base model.
    turkishBERTweet_sa = PeftModel.from_pretrained(base_model, peft_model)
    label_list = []
    with torch.no_grad():  # inference only; no gradients needed
        for post in entries_list:
            encoded = tokenizer.encode_plus(post, return_tensors="pt")
            predicted_id = turkishBERTweet_sa(**encoded).logits.argmax(-1).item()
            label_list.append(id2label_sa[predicted_id])
    tallies = Counter(label_list)
    result = f"Total Entries Considered (most recent): {len(label_list)}\n\n\tPositive posts: {tallies['positive']}\n\tNeutral Posts: {tallies['neutral']}\n\tNegative Posts: {tallies['negative']}"
    image = create_pie_chart(tallies['positive'], tallies['neutral'], tallies['negative'])
    return result, image
def getSummary(url_main, tokens_create, sentiment, lang="English"):
    """
    Scrape a title's posts, summarize them via the OpenAI API, and
    optionally run sentiment analysis on them.

    Parameters:
        url_main (str): URL of the EksiSozluk title.
        tokens_create (int): maximum output tokens for the summary.
        sentiment (bool): whether to also run sentiment analysis.
        lang (str): language the summary should be written in.

    Returns:
        tuple: (summary prefixed with the entry count,
                sentiment report or "--" when disabled,
                pie-chart image or None when disabled).
    """
    url_title = get_page_title(url_main)
    print(f"Title is extracted: {url_title}")
    print(f"Starting to scrape EksiSozluk entries for the title '{url_title}'...")
    entries_list = all_pages_scrape(url_main)
    print(f"Generating the summary...\n")
    prompt = f"""
You are going to be a presented a list of strings below. Each string in the list is in Turkish. \
These strings are scraped from a Turkish forum that resembles Reddit, called Ekşi Sözlük. \
Each string in the list represents a post, under a specified title. The list of strings will be specified under single quotations. \
The title representing the topic of the posts will also be given below as well (under single quotations). \
Summarize what is being said in these posts overall, for someone who does not know anything neither about the posts nor the title. \
Write the summary in {lang}. Use bullet points for better clarity. Please do not have incomplete sentence(s) in the output.
Title: '{url_title}'
List of strings (posts): '{entries_list}'
"""
    # The summary is needed in both branches; build it once instead of
    # duplicating the API call in each branch.
    response = get_completion(prompt=prompt, tokens_create=tokens_create)
    response2 = f"Total number of entries considered: {len(entries_list)}\n" + response
    if not sentiment:
        return response2, "--", None
    try:
        sentiment_result, image = sentiment_analysis(entries_list)
    except Exception:
        # Large inputs (>= ~250 entries) have been observed to fail;
        # fall back to the 200 most recent entries.
        print("***Entered Exception for Sentiment Analysis...***")
        sentiment_result, image = sentiment_analysis(entries_list[-200:])
    return response2, sentiment_result, image