# eksiSUMM / functions.py
# Author: Onat Kaya
# Last commit: 91f3eff — fixed minor syntax issue regarding api_key
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
from openai import OpenAI
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from collections import Counter
from peft import PeftModel, PeftConfig
import matplotlib.pyplot as plt
import io
from PIL import Image
import os
api_key = os.environ.get("OPENAI_KEY")
def check_multi_page(url_main):
    """
    Check whether the page of the title is multi-paged (contains a page
    counter) or single-paged (does not contain a page counter).

    Parameters:
        url_main (str): URL of the EksiSozluk title page.

    Returns:
        tuple[bool, int]: (True, total_pages) when the page has a pager,
        (False, 1) when it is single-paged.
    """
    headers1 = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    req = Request(url_main, headers=headers1)
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    pager = soup.find('div', class_="pager")
    try:
        # The pager div carries the total page count in its
        # "data-pagecount" attribute; read it via BeautifulSoup's Tag
        # item access instead of string-slicing the raw HTML.
        pagecount_int = int(pager['data-pagecount'])
        print(f"This title contains {pagecount_int} pages.")
        return True, pagecount_int  # it is multi-paged.
    except (TypeError, KeyError, ValueError):
        # pager is None (no pager div), the attribute is missing, or the
        # attribute value is not an integer — treat all as single-paged.
        print("This title only contains 1 page.")
        return False, 1  # it is single-paged.
def single_page_scrape(url):
    """
    Scrape all the entries from a single URL page.

    Parameters:
        url (str): URL of the page to scrape.

    Returns:
        list[str]: one cleaned string per post on the page.
    """
    ua_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    request = Request(url, headers=ua_headers)
    html = urlopen(request).read()
    parsed = BeautifulSoup(html, 'html.parser')
    # Each post lives in a <div class="content">; keep only the text and
    # drop the surrounding whitespace.
    return [node.text.strip() for node in parsed.find_all('div', class_='content')]
def get_page_title(url):
    """
    Return the title of an EksiSozluk page.

    In EksiSozluk, every page has a title; it is published inside a
    <span itemprop="name"> element.

    Parameters:
        url (str): URL of the page.

    Returns:
        str: the page title.
    """
    ua_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    request = Request(url, headers=ua_headers)
    html = urlopen(request).read()
    parsed = BeautifulSoup(html, 'html.parser')
    title_node = parsed.find('span', itemprop='name')
    return str(title_node.text)
def all_pages_scrape(url_main):
    """
    Scrape all the entries from all pages of a title.

    Parameters:
        url_main (str): base URL of the EksiSozluk title.

    Returns:
        list[str]: one string per post, collected across every page.
    """
    multi_page_bool, num_pages = check_multi_page(url_main)
    all_entries = []
    if multi_page_bool:
        # Pages are 1-indexed and selected with the "?p=" query parameter.
        for page in range(1, num_pages + 1):
            print(f"Scraping page {page}...")
            all_entries.extend(single_page_scrape(f"{url_main}?p={page}"))
    else:
        all_entries.extend(single_page_scrape(url_main))
    print("Scraping EksiSozluk entries is completed!")
    return all_entries
# Using OpenAI API, for summarization
def get_completion(prompt, tokens_create, model="gpt-4o-mini"):
    """
    Request a completion from the OpenAI Responses API.

    Parameters:
        prompt (str): the user prompt to send.
        tokens_create (int): cap on the number of output tokens.
        model (str): OpenAI model name to query.

    Returns:
        str: the model's text output.
    """
    client = OpenAI(api_key=api_key)
    response = client.responses.create(
        model=model,
        input=[{"role": "user", "content": prompt}],
        temperature=0.1,  # low randomness -> near-deterministic summaries
        max_output_tokens=tokens_create,
    )
    return response.output_text
def create_pie_chart(positives, neutrals, negatives):
    """
    Render the sentiment counts as a pie chart.

    Parameters:
        positives (int): number of positive posts.
        neutrals (int): number of neutral posts.
        negatives (int): number of negative posts.

    Returns:
        PIL.Image.Image: the rendered chart as an in-memory PNG image.
    """
    labels = ['positive', 'neutral', 'negative']
    sizes = [positives, neutrals, negatives]
    colors = ['lightgreen', 'skyblue', 'salmon']
    explode = (0.1, 0.1, 0.1)  # pull every wedge slightly outward for emphasis
    fig, ax = plt.subplots(figsize=(6, 6))
    # Suppress the percentage label on zero-sized wedges so empty
    # sentiment categories do not clutter the chart.
    ax.pie(sizes, colors=colors, explode=explode, startangle=140,
           autopct=lambda p: f'{p:.1f}%' if p > 0 else '')
    ax.legend(labels, loc="best")
    ax.set_title('Sentiment Analysis Results')
    ax.axis('equal')
    # Put the figure through a buffer, and then convert it to an Image.
    im_buf = io.BytesIO()
    # Save this figure explicitly rather than via plt.savefig, which acts
    # on the implicit "current" figure and could pick up another one.
    fig.savefig(im_buf, format='png')
    plt.close(fig)
    im_buf.seek(0)
    image = Image.open(im_buf)
    return image
# https://huggingface.co/VRLLab/TurkishBERTweet
def sentiment_analysis(entries_list):
    """
    Classify each post as positive/neutral/negative with TurkishBERTweet.

    Parameters:
        entries_list (list[str]): posts to classify.

    Returns:
        tuple[str, PIL.Image.Image]: a human-readable tally of the labels
        and a pie chart of the label distribution.
    """
    print("Conducting Sentiment Analysis on Posts...")
    peft_model = "VRLLab/TurkishBERTweet-Lora-SA"
    peft_config = PeftConfig.from_pretrained(peft_model)
    # Load the tokenizer of the base model that the LoRA adapter was
    # trained on, padding on the right.
    tokenizer = AutoTokenizer.from_pretrained(
        peft_config.base_model_name_or_path, padding_side="right")
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    id2label_sa = {0: "negative", 2: "positive", 1: "neutral"}
    base_model = AutoModelForSequenceClassification.from_pretrained(
        peft_config.base_model_name_or_path,
        return_dict=True,
        num_labels=len(id2label_sa),
        id2label=id2label_sa,
    )
    # Attach the sentiment-analysis LoRA adapter on top of the base model.
    turkishBERTweet_sa = PeftModel.from_pretrained(base_model, peft_model)
    label_list = []
    with torch.no_grad():  # inference only; no gradients needed
        for post in entries_list:
            encoded = tokenizer.encode_plus(post, return_tensors="pt")
            predicted_id = turkishBERTweet_sa(**encoded).logits.argmax(-1).item()
            label_list.append(id2label_sa[predicted_id])
    tallies = Counter(label_list)
    result = f"Total Entries Considered (most recent): {len(label_list)}\n\n\tPositive posts: {tallies['positive']}\n\tNeutral Posts: {tallies['neutral']}\n\tNegative Posts: {tallies['negative']}"
    image = create_pie_chart(tallies['positive'], tallies['neutral'], tallies['negative'])
    return result, image
def getSummary(url_main, tokens_create, sentiment, lang="English"):
    """
    Scrape a title's posts, summarize them via the OpenAI API, and
    optionally run sentiment analysis on them.

    Parameters:
        url_main (str): URL of the EksiSozluk title.
        tokens_create (int): maximum output tokens for the summary.
        sentiment (bool): whether to also run sentiment analysis.
        lang (str): language the summary should be written in.

    Returns:
        tuple: (summary prefixed with the entry count,
                sentiment report or "--" when disabled,
                pie-chart image or None when disabled).
    """
    url_title = get_page_title(url_main)
    print(f"Title is extracted: {url_title}")
    print(f"Starting to scrape EksiSozluk entries for the title '{url_title}'...")
    entries_list = all_pages_scrape(url_main)
    print(f"Generating the summary...\n")
    prompt = f"""
You are going to be a presented a list of strings below. Each string in the list is in Turkish. \
These strings are scraped from a Turkish forum that resembles Reddit, called Ekşi Sözlük. \
Each string in the list represents a post, under a specified title. The list of strings will be specified under single quotations. \
The title representing the topic of the posts will also be given below as well (under single quotations). \
Summarize what is being said in these posts overall, for someone who does not know anything neither about the posts nor the title. \
Write the summary in {lang}. Use bullet points for better clarity. Please do not have incomplete sentence(s) in the output.
Title: '{url_title}'
List of strings (posts): '{entries_list}'
"""
    # The summary is needed in both branches; build it once instead of
    # duplicating the API call in each branch.
    response = get_completion(prompt=prompt, tokens_create=tokens_create)
    response2 = f"Total number of entries considered: {len(entries_list)}\n" + response
    if not sentiment:
        return response2, "--", None
    try:
        sentiment_result, image = sentiment_analysis(entries_list)
    except Exception:
        # Large inputs (>= ~250 entries) have been observed to fail;
        # fall back to the 200 most recent entries.
        print("***Entered Exception for Sentiment Analysis...***")
        sentiment_result, image = sentiment_analysis(entries_list[-200:])
    return response2, sentiment_result, image