from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
from openai import OpenAI
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from collections import Counter
from peft import PeftModel, PeftConfig
import matplotlib.pyplot as plt
import io
from PIL import Image
import os

api_key = os.environ.get("OPENAI_KEY")
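# NOTE: the OpenAI key is read from the OPENAI_KEY environment variable (e.g. set as a
# Space secret); if it is unset, the OpenAI client in get_completion() cannot authenticate.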

def check_multi_page(url_main):
    """
    Check whether the title's page is multi-paged (contains a page counter)
    or single-paged (does not contain a page counter). Returns True for the
    multi-paged case, along with the total number of pages (1 for single pages).
    RETURN: tuple containing a boolean and an int.
    """
    headers1 = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    req = Request(url_main, headers=headers1)
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    try:
        pager = soup.find_all('div', class_="pager")[0]
        temp = str(pager)
        # Pull the value of the data-pagecount attribute out of the pager markup.
        data_pagecount_index = temp.index("data-pagecount")
        first_quote = temp.index("\"", data_pagecount_index)
        second_quote = temp.index("\"", first_quote + 1)
        pagecount_int = int(temp[first_quote + 1:second_quote])
        print(f"This title contains {pagecount_int} pages.")
        return True, pagecount_int  # it is multi-paged
    except (IndexError, ValueError):
        print("This title only contains 1 page.")
        return False, 1  # it is single-paged

def single_page_scrape(url):
    """
    Scrape all entries from a single page URL.
    RETURN: a list of strings; each string is one post from the specified page.
    """
    headers1 = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    req = Request(url, headers=headers1)
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    entries = soup.find_all('div', class_='content')  # all raw entry divs on the page
    entry_list = [a.text.strip() for a in entries]    # clean each entry down to its text
    return entry_list

def get_page_title(url):
    """
    In EksiSozluk, every page has a title.
    This function returns that title as a str.
    """
    headers1 = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    req = Request(url, headers=headers1)
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    title = soup.find('span', itemprop='name').text  # the title lives in a span with itemprop="name"
    return str(title)

def all_pages_scrape(url_main):
    """
    Scrape all entries from all pages of a title.
    RETURN: a list of strings; each string is one post, collected across every available page.
    """
    multi_page_bool, num_pages = check_multi_page(url_main)
    all_entries = []
    if multi_page_bool:
        for page in range(num_pages):
            print(f"Scraping page {page+1}...")
            temp_url = url_main + "?p=" + str(page + 1)
            temp_entries = single_page_scrape(temp_url)
            all_entries.extend(temp_entries)
    else:
        all_entries.extend(single_page_scrape(url_main))
    print("Scraping EksiSozluk entries is completed!")
    return all_entries

# Using OpenAI API, for summarization
def get_completion(prompt, tokens_create, model="gpt-4o-mini"):
    client = OpenAI(api_key=api_key)
    messages = [{"role": "user", "content": prompt}]
    response = client.responses.create(
        model=model,
        input=messages,
        temperature=0.1,  # this is the degree of randomness of the model's output
        max_output_tokens=tokens_create
    )
    return response.output_text

def create_pie_chart(positives, neutrals, negatives):
    labels = ['positive', 'neutral', 'negative']
    sizes = [positives, neutrals, negatives]
    colors = ['lightgreen', 'skyblue', 'salmon']
    explode = (0.1, 0.1, 0.1)  # pull every slice out slightly for emphasis
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.pie(sizes, colors=colors, explode=explode, startangle=140,
           autopct=lambda p: f'{p:.1f}%' if p > 0 else '')
    ax.legend(labels, loc="best")
    ax.set_title('Sentiment Analysis Results')
    ax.axis('equal')
    # Write the figure into an in-memory buffer, then convert it to a PIL Image
    im_buf = io.BytesIO()
    plt.savefig(im_buf, format='png')
    plt.close(fig)
    im_buf.seek(0)
    image = Image.open(im_buf)
    return image

# https://huggingface.co/VRLLab/TurkishBERTweet
def sentiment_analysis(entries_list):
    print("Conducting Sentiment Analysis on Posts...")
    peft_model = "VRLLab/TurkishBERTweet-Lora-SA"
    peft_config = PeftConfig.from_pretrained(peft_model)
    # loading Tokenizer
    padding_side = "right"
    tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, padding_side=padding_side)
    if getattr(tokenizer, "pad_token_id") is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    id2label_sa = {0: "negative", 1: "neutral", 2: "positive"}
    turkishBERTweet_sa = AutoModelForSequenceClassification.from_pretrained(
        peft_config.base_model_name_or_path, return_dict=True, num_labels=len(id2label_sa), id2label=id2label_sa
    )
    turkishBERTweet_sa = PeftModel.from_pretrained(turkishBERTweet_sa, peft_model)
    label_list = []
    with torch.no_grad():
        for s in entries_list:
            ids = tokenizer.encode_plus(s, return_tensors="pt")
            label_id = turkishBERTweet_sa(**ids).logits.argmax(-1).item()
            label_list.append(id2label_sa[label_id])
    counter_list = Counter(label_list)
    result = (
        f"Total Entries Considered (most recent): {len(label_list)}\n\n"
        f"\tPositive posts: {counter_list['positive']}\n"
        f"\tNeutral Posts: {counter_list['neutral']}\n"
        f"\tNegative Posts: {counter_list['negative']}"
    )
    image = create_pie_chart(counter_list['positive'], counter_list['neutral'], counter_list['negative'])
    return result, image

def getSummary(url_main, tokens_create, sentiment, lang="English"):
    url_title = get_page_title(url_main)
    print(f"Title is extracted: {url_title}")
    print(f"Starting to scrape EksiSozluk entries for the title '{url_title}'...")
    entries_list = all_pages_scrape(url_main)
    print("Generating the summary...\n")
    prompt = f"""
    You are going to be presented with a list of strings below. Each string in the list is in Turkish. \
    These strings are scraped from a Turkish forum that resembles Reddit, called Ekşi Sözlük. \
    Each string in the list represents a post under a specified title. The list of strings will be specified under single quotations. \
    The title representing the topic of the posts will also be given below (under single quotations). \
    Summarize what is being said in these posts overall, for someone who knows nothing about either the posts or the title. \
    Write the summary in {lang}. Use bullet points for better clarity. Please do not have incomplete sentence(s) in the output.
    Title: '{url_title}'
    List of strings (posts): '{entries_list}'
    """
    if not sentiment:
        response = get_completion(prompt=prompt, tokens_create=tokens_create)
        response2 = f"Total number of entries considered: {len(entries_list)}\n" + response
        return response2, "--", None
    else:
        response = get_completion(prompt=prompt, tokens_create=tokens_create)
        response2 = f"Total number of entries considered: {len(entries_list)}\n" + response
        try:
            sentiment_result, image = sentiment_analysis(entries_list)
        except Exception:  # fails on long lists (roughly >= 250 entries), though it once errored on [-245:] too
            print("***Entered Exception for Sentiment Analysis...***")
            sentiment_result, image = sentiment_analysis(entries_list[-200:])
        return response2, sentiment_result, image
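
# A minimal usage sketch (hypothetical values, kept as a comment so it does not run on import):
# the URL and token budget below are illustrative placeholders, not part of the app itself.
#
#   summary, sentiment_text, chart = getSummary(
#       "https://eksisozluk.com/<some-title>",  # any EksiSozluk title URL
#       tokens_create=500,                      # cap on the summary's output tokens
#       sentiment=True,                         # also run TurkishBERTweet sentiment analysis
#       lang="English",                         # language of the generated summary
#   )
#   print(summary)          # bullet-point summary of the scraped posts
#   print(sentiment_text)   # counts of positive/neutral/negative posts
#   # `chart` is a PIL Image of the sentiment pie chart (None when sentiment=False)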