Ismetdh committed on
Commit
04cdb6f
·
verified ·
1 Parent(s): 5a85659

Upload 12 files

Browse files
README.md CHANGED
@@ -1,10 +1,14 @@
1
- ---
2
- title: README
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: indigo
6
- sdk: streamlit
7
- pinned: false
8
- ---
9
-
10
- Edit this `README.md` markdown file to author your organization card.
 
 
 
 
 
1
+ ---
2
+ title: FlanT5 Base Fine Tuned For Summarization
3
+ emoji: 🏢
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.40.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: FlanT5_base_fine_tuned_on_CNNDailymail_for_Summarization
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,60 +1,106 @@
1
  import streamlit as st
2
- from streamlit_pills import pills
3
- st.markdown("<h1 style='font-size: 50px;'>QuantumQuest</h1>", unsafe_allow_html=True)
4
-
5
- st.markdown("""
6
- <style>
7
- .category-button {
8
- background-color: #4CAF50; /* Green */
9
- border: none;
10
- width: 150px; /* Set fixed width */
11
- height: 50px; /* Set fixed height */
12
- color: white;
13
- padding: 10px 24px;
14
- text-align: center;
15
- text-decoration: none;
16
- display: inline-block;
17
- font-size: 16px;
18
- margin: 4px 2px;
19
- cursor: pointer;
20
- border-radius: 8px;
21
- transition-duration: 0.4s;
22
- }
23
-
24
- .category-button:hover {
25
- background-color: #3e8e41;
26
- color: white;
27
- }
28
- </style>
29
- """, unsafe_allow_html=True)
30
-
31
-
32
- # List of categories
33
- categories = ["All","International News", "Sports", "Business", "Science/Tech", "Politics", "Entertainment", "Others"]
34
-
35
- num_columns = 3 # Adjust based on how many buttons per row you want
36
- rows = [categories[i:i + num_columns] for i in range(0, len(categories), num_columns)]
37
-
38
- tab1, tab2 = st.tabs([ "Categories","Popular News"])
39
-
40
-
41
-
42
- with tab1:
43
-
44
- with st.container():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  st.header("Categories")
46
- selection = pills("", categories)
47
- st.write(f"You have selected {selection}")
48
-
49
-
50
-
51
-
52
- with tab2:
53
- col1,col2 = st.columns([3,1])
54
- with col1:
55
-
56
- st.header("Recent News")
57
-
58
- with col2:
59
- st.button("Update")
60
- st.write("Demo goes here")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import fetch_top_news as news_api
3
+ import time
4
+
5
+ # Function to load news into session state
6
def load_news():
    """Fetch the top news into ``st.session_state`` when it is missing or stale.

    Writes two session-state entries:
      * ``news_data`` - the list returned by ``news_api.get_top_news``.
      * ``last_fetch_time`` - ``time.time()`` at the moment of the fetch.

    The previous version only fetched when ``news_data`` was absent, so a
    call made because the cached data had expired did nothing. The staleness
    check is now part of the condition, so an expired cache is re-fetched.
    """
    if "news_data" not in st.session_state or should_refresh_data():
        st.session_state.news_data = news_api.get_top_news(language="en")
        st.session_state.last_fetch_time = time.time()  # Save the fetch time
11
+
12
+ # Function to check if the news needs to be refreshed
13
def should_refresh_data(max_age_seconds=10800):
    """Tell whether the cached news should be fetched again.

    Args:
        max_age_seconds: Maximum age of the cached data, in seconds, before
            it counts as stale. Defaults to 10800 (3 hours).

    Returns:
        True when no ``last_fetch_time`` is stored in ``st.session_state``
        or when more than ``max_age_seconds`` have passed since it.
    """
    if "last_fetch_time" not in st.session_state:
        return True  # Nothing has been fetched in this session yet
    age_seconds = time.time() - st.session_state.last_fetch_time
    return age_seconds > max_age_seconds
18
+
19
+ # Function to filter news by category
20
def filter_news_by_category(news_data, category):
    """Return the news items whose ``"category"`` equals ``category``.

    Args:
        news_data: List of news dicts.
        category: Category name to keep. ``"All"`` or ``None`` (no category
            selected) returns ``news_data`` unchanged.

    Returns:
        ``news_data`` itself for ``"All"``/``None``; otherwise a new list of
        the matching items. Items without a ``"category"`` key never match
        instead of raising ``KeyError``.
    """
    # NOTE(review): items also carry a "predicted_category" key; confirm
    # whether the UI should filter on that instead of "category".
    if category is None or category == "All":
        return news_data
    return [item for item in news_data if item.get("category") == category]
24
+
25
def main():
    """Render the QuantumQuest page.

    Draws the title and button CSS, loads or refreshes the news held in
    ``st.session_state`` and renders three tabs: a category-filtered list,
    the full list of fetched articles, and a scratch "Test Area".
    """
    st.markdown("<h1 style='font-size: 50px;'>QuantumQuest</h1>", unsafe_allow_html=True)

    # The <style> tag starts at column 0 so Markdown treats it as an HTML
    # block rather than as an indented code block.
    st.markdown("""
<style>
    .category-button {
        background-color: #4CAF50; /* Green */
        border: none;
        width: 150px;  /* Set fixed width */
        height: 50px;  /* Set fixed height */
        color: white;
        padding: 10px 24px;
        text-align: center;
        text-decoration: none;
        display: inline-block;
        font-size: 16px;
        margin: 4px 2px;
        cursor: pointer;
        border-radius: 8px;
        transition-duration: 0.4s;
    }

    .category-button:hover {
        background-color: #3e8e41;
        color: white;
    }
</style>
""", unsafe_allow_html=True)

    # List of categories
    categories = ["All", "Sports", "Business", "Science/Tech", "Politics", "Entertainment", "Others"]

    # Load or refresh news data
    if should_refresh_data():
        load_news()

    tab1, tab2, tab3 = st.tabs(["Categories", "Popular News", "Test Area"])

    # Categories Tab
    with tab1:
        st.header("Categories")

        # A non-empty label is kept for accessibility and hidden visually.
        # st.pills returns None when the user deselects the pill, so fall
        # back to "All" instead of showing an empty list.
        selection = st.pills(
            "Category",
            categories,
            default="All",
            label_visibility="collapsed",
        ) or "All"
        st.write(f"You have selected: {selection}")

        if "news_data" in st.session_state:
            news_data = st.session_state.news_data
            filtered_news = filter_news_by_category(news_data, selection)

            if not filtered_news:
                st.info("No news articles found for this category.")

            # Display each news item in its own bordered container
            for news in filtered_news:
                with st.container(border=True):
                    st.markdown(f"**{news['title']}**")
                    st.write(news['summary'])
                    st.markdown(f"[Read more]({news['url']})")
                st.divider()  # Adds a divider between news items

    # Popular News Tab
    with tab2:
        if "news_data" in st.session_state:
            news_data = st.session_state.news_data
            st.write(f"Fetched {len(news_data)} news articles.")

            # Display the news articles
            for news in news_data:
                st.subheader(f"{news['index']}. {news['title']}")
                st.write(news['summary'])
                st.write(f"[Read more]({news['url']})")
                st.divider()

    # Test Area Tab
    with tab3:
        container1 = st.container()
        container1.write("This is container 1")

        container2 = st.container()
        container2.write("This is container 2")
104
+
105
+ if __name__ == "__main__":
106
+ main()
categorizer.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from nltk.tokenize import word_tokenize
3
+ import re
4
+ import pickle
5
+ import gensim.downloader as api
6
+
7
+
8
class NewsCategorizer:
    """Predict a news category from article text.

    The text is cleaned, tokenized with NLTK, averaged into a single
    word2vec vector and passed to a pickled classifier; the predicted label
    is mapped back to a category name with a pickled label encoder.
    """

    def __init__(self, gensim_model_name='word2vec-google-news-300', model_file='logistic_regression_model.pkl', encoder_file='label_encoder.pkl'):
        """Load the embedding model, the classifier and the label encoder.

        Args:
            gensim_model_name: Name passed to ``gensim.downloader.load``.
            model_file: Path of the pickled classifier.
            encoder_file: Path of the pickled label encoder.
        """
        # Load gensim model
        self.gensim_model = api.load(gensim_model_name)
        print("New categorizer was called")

        # Load classifier model and label encoder.
        # SECURITY: pickle.load can execute arbitrary code; only point these
        # paths at files you trust.
        with open(model_file, 'rb') as f:
            self.model = pickle.load(f)

        with open(encoder_file, 'rb') as le_file:
            self.label_encoder = pickle.load(le_file)

    def clean_text(self, text):
        """Remove every character that is not an ASCII letter or whitespace, then lowercase.

        The previous call passed ``re.I`` as the fourth positional argument
        of ``re.sub``, which is ``count``, so only the first two offending
        characters were removed. The character class already lists both
        cases, so no flag is needed.
        """
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text.lower()

    def get_word2vec_embeddings(self, tokens):
        """Return the mean embedding of the tokens known to the model.

        Tokens missing from ``self.gensim_model`` are skipped. When no token
        is known, a zero vector of length ``vector_size`` is returned.
        """
        embeddings = [
            self.gensim_model[token]
            for token in tokens
            if token in self.gensim_model
        ]
        if embeddings:
            return np.mean(embeddings, axis=0)
        return np.zeros(self.gensim_model.vector_size)

    def predict_category(self, text):
        """Predict the category name for ``text`` using the loaded models."""
        cleaned_text = self.clean_text(text)
        tokens = word_tokenize(cleaned_text)
        # The classifier expects a 2-D array holding a single sample.
        embeddings = self.get_word2vec_embeddings(tokens).reshape(1, -1)
        predicted_label = self.model.predict(embeddings)[0]
        predicted_category = self.label_encoder.inverse_transform([predicted_label])[0]
        return predicted_category
47
+
48
+
49
+ # # Example Usage
50
+ # # Initialize the NewsCategorizer class
51
+ # categorizer = NewsCategorizer()
52
+
53
+ # # Example text for prediction
54
+ # unknown_text = """A horrifying incident in Sultanpuri, Delhi, has led to the arrest of Neeraj Solanki and four of his family members for allegedly killing and burying his three-day-old twin daughters. The police revealed that the act was driven by Solanki's preference for a male child. Following the birth of the twins on May 30, the newborns were taken to a cremation ground and buried after being killed. The investigation began after the children’s mother reported the crime to the police."""
55
+
56
+ # # Predict the category for the unknown text
57
+ # predicted_category = categorizer.predict_category(unknown_text)
58
+ # print(f"The predicted category is: {predicted_category}")
config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/flan-t5-base",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 2048,
8
+ "d_kv": 64,
9
+ "d_model": 768,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 12,
22
+ "num_heads": 12,
23
+ "num_layers": 12,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_max_distance": 128,
27
+ "relative_attention_num_buckets": 32,
28
+ "task_specific_params": {
29
+ "summarization": {
30
+ "early_stopping": true,
31
+ "length_penalty": 2.0,
32
+ "max_length": 200,
33
+ "min_length": 30,
34
+ "no_repeat_ngram_size": 3,
35
+ "num_beams": 4,
36
+ "prefix": "summarize: "
37
+ },
38
+ "translation_en_to_de": {
39
+ "early_stopping": true,
40
+ "max_length": 300,
41
+ "num_beams": 4,
42
+ "prefix": "translate English to German: "
43
+ },
44
+ "translation_en_to_fr": {
45
+ "early_stopping": true,
46
+ "max_length": 300,
47
+ "num_beams": 4,
48
+ "prefix": "translate English to French: "
49
+ },
50
+ "translation_en_to_ro": {
51
+ "early_stopping": true,
52
+ "max_length": 300,
53
+ "num_beams": 4,
54
+ "prefix": "translate English to Romanian: "
55
+ }
56
+ },
57
+ "tie_word_embeddings": false,
58
+ "torch_dtype": "float32",
59
+ "transformers_version": "4.44.2",
60
+ "use_cache": true,
61
+ "vocab_size": 32128
62
+ }
fetch_top_news.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from dotenv import load_dotenv, find_dotenv
3
+ dotenv_path = find_dotenv()
4
+ load_dotenv(dotenv_path)
5
+ import os
6
+ from datetime import datetime
7
+ import requests
8
+ from categorizer import NewsCategorizer
9
+ from summarizer import Summarizer
10
+ import json
11
# Date string captured once, when this module is imported.
current_day = datetime.now().strftime("%Y-%m-%d")
api_key = os.getenv("WORLD_NEWS_API")

# Instantiate the NewsCategorizer class
categorizer_instance = NewsCategorizer()

# Resolve the summarizer files relative to this module instead of an
# absolute path on a developer machine, which does not exist once deployed.
# Both locations can be overridden through environment variables.
# NOTE(review): defaults assume config.json / model.safetensors sit next to
# this file and that the tokenizer is either in a local
# "local_google_flan_t5_tokenizer" folder or fetched by its hub id - confirm.
_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
_MODEL_PATH = os.getenv("SUMMARIZER_MODEL_PATH", _BASE_DIR)
_LOCAL_TOKENIZER_DIR = os.path.join(_MODEL_PATH, "local_google_flan_t5_tokenizer")
_TOKENIZER_PATH = os.getenv(
    "SUMMARIZER_TOKENIZER_PATH",
    _LOCAL_TOKENIZER_DIR if os.path.isdir(_LOCAL_TOKENIZER_DIR) else "google/flan-t5-base",
)

summarizer_instance = Summarizer(
    model_path=_MODEL_PATH,
    tokenizer_path=_TOKENIZER_PATH,
)
21
+
22
def get_top_news(language="en", date=None, number=30):
    """Fetch news articles, categorize and summarize them, and cache them on disk.

    Args:
        language: Language code sent to the API, e.g. ``'en'`` or ``'th'``.
        date: Date in ``YYYY-MM-DD`` format. ``None`` (the default) means
            today, computed at call time so a long-running process does not
            keep requesting the day on which the module was imported.
        number: Number of articles to request.

    Returns:
        A list of dicts with the keys ``index``, ``title``, ``text``,
        ``url``, ``category``, ``summary`` and ``predicted_category``.
        The list is empty when the request fails or returns no articles.
    """
    if date is None:
        date = datetime.now().strftime("%Y-%m-%d")

    print("get_top_news was called")

    def save_news_to_file(news_list, filename="news_data.json"):
        """Write the articles and a timestamp to ``filename`` as JSON."""
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        # Save data with a timestamp in the file
        data_to_save = {
            "timestamp": timestamp,
            "news": news_list
        }
        with open(filename, 'w', encoding="utf-8") as f:
            json.dump(data_to_save, f)
        print(f"News data saved to {filename} at {timestamp}")

    base_url = "https://api.worldnewsapi.com/search-news"
    news_list = []  # List to store the news articles

    # NOTE(review): verify 'published_date' and 'sort-by' against the World
    # News API reference; the search endpoint may expect different names.
    params = {
        'api-key': api_key,
        'language': language,      # Language code, e.g., 'en' for English, 'th' for Thai
        'published_date': date,    # Date in YYYY-MM-DD format
        'sort-by': 'relevance',
        'number': number           # Number of articles to retrieve
    }

    # A timeout keeps the app from hanging forever on a stalled connection;
    # network failures are reported the same way as HTTP errors.
    try:
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException as exc:
        print("Error:", exc)
        return news_list

    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return news_list

    try:
        articles = response.json().get("news", [])
    except ValueError as exc:  # Body was not valid JSON
        print("Error:", exc)
        return news_list

    for i, article in enumerate(articles, start=1):
        text = article.get("text") or ""
        if text:
            predicted_category = categorizer_instance.predict_category(text)
            # Generate summary using the summarizer
            summary = summarizer_instance.iterative_summarization(
                text=text,
                chunk_size=350,
                overlap_size=50,
                max_iterations=5,
                use_prompt_template=True
            )
        else:
            # Nothing to classify or summarize for an empty article body.
            predicted_category = "Others"
            summary = "No summary available"

        # Collect the data for each article
        news_list.append({
            "index": i,
            "title": article.get("title", "No Title"),
            "text": article.get("text", "No Text"),  # Full article text
            "url": article.get("url", "No URL"),
            "category": article.get("category", "Others"),
            "summary": summary,
            "predicted_category": predicted_category
        })

    save_news_to_file(news_list)
    return news_list  # Return the list of news articles
86
+
87
+ # # Example usage
88
+ # news = get_top_news(language="en", date="2024-11-19")
89
+ # for item in news:
90
+ # print(item)
91
+
92
+
93
+
94
+ # def get_top_news(language="en", date=current_day,number=30):
95
+ # temp = [
96
+ # {"index": 1, "title": "World Cup 2024: The Road to the Final", "text": "The World Cup 2024 is heating up, with top teams battling for a spot in the final... (rest of the article text)", "url": "https://example.com/world-cup-2024", "category": "Sports", "summary": "The World Cup 2024 is nearing its final stages with intense matches between the world's top teams.", "predicted_category": "Sports"},
97
+ # {"index": 2, "title": "Tech Giants Invest in AI for 2024", "text": "Major tech companies are doubling down on AI technologies with multi-billion dollar investments... (rest of the article text)", "url": "https://example.com/tech-giants-ai-investment", "category": "Science/Tech", "summary": "Tech companies are boosting AI investments, with the focus on automation and machine learning innovations.", "predicted_category": "Science/Tech"},
98
+ # {"index": 3, "title": "Stocks Hit Record High Amid Global Economic Recovery", "text": "The stock market has reached new heights, with positive news about global economic recovery... (rest of the article text)", "url": "https://example.com/stock-market-record-high", "category": "Business", "summary": "The stock market reaches an all-time high as signs of global economic recovery boost investor confidence.", "predicted_category": "Business"},
99
+ # {"index": 4, "title": "New Legislation Aims to Tackle Climate Change", "text": "Lawmakers are debating new legislation aimed at addressing climate change with stricter regulations... (rest of the article text)", "url": "https://example.com/climate-change-legislation", "category": "Politics", "summary": "New legislation to combat climate change is being debated, focusing on stricter environmental regulations.", "predicted_category": "Politics"},
100
+ # {"index": 5, "title": "AI Revolutionizing Healthcare Diagnostics", "text": "AI systems are making waves in healthcare, improving diagnostics and treatment recommendations... (rest of the article text)", "url": "https://example.com/ai-healthcare-diagnostics", "category": "Science/Tech", "summary": "AI is revolutionizing healthcare by improving diagnostic accuracy and treatment efficiency.", "predicted_category": "Science/Tech"},
101
+ # {"index": 6, "title": "Oscars 2024: The Big Winners", "text": "The Oscars 2024 saw exciting wins, with new films taking home prestigious awards... (rest of the article text)", "url": "https://example.com/oscars-2024-winners", "category": "Entertainment", "summary": "The 2024 Oscars highlighted new talent and groundbreaking films, with major wins in multiple categories.", "predicted_category": "Entertainment"},
102
+ # {"index": 7, "title": "Government Launches New Initiative for Rural Development", "text": "A new initiative has been launched by the government to boost economic development in rural areas... (rest of the article text)", "url": "https://example.com/rural-development-initiative", "category": "Politics", "summary": "The government has launched a new initiative aimed at improving infrastructure and opportunities in rural regions.", "predicted_category": "Politics"},
103
+ # {"index": 8, "title": "The Future of Renewable Energy: Solar Dominates", "text": "Solar energy is expected to dominate the renewable energy market in the coming decades... (rest of the article text)", "url": "https://example.com/future-renewable-energy-solar", "category": "Science/Tech", "summary": "Solar energy is poised to take the lead in renewable energy, driven by technological advancements and policy support.", "predicted_category": "Science/Tech"},
104
+ # {"index": 9, "title": "2024 Tokyo Marathon: The Race of the Year", "text": "The Tokyo Marathon 2024 is set to attract the best runners from across the globe... (rest of the article text)", "url": "https://example.com/tokyo-marathon-2024", "category": "Sports", "summary": "The Tokyo Marathon 2024 will feature top athletes and promising new talent, making it one of the biggest races of the year.", "predicted_category": "Sports"},
105
+ # {"index": 10, "title": "Breaking News: Major Tech Merger Announced", "text": "Two major tech firms have announced a merger, which is expected to reshape the industry... (rest of the article text)", "url": "https://example.com/tech-merger-announcement", "category": "Business", "summary": "A major tech merger has been announced, signaling significant shifts in the technology sector.", "predicted_category": "Business"},
106
+ # {"index": 11, "title": "Political Tensions Rise Ahead of Election", "text": "With the upcoming election, political tensions are escalating as parties prepare for a fierce battle... (rest of the article text)", "url": "https://example.com/political-tensions-election", "category": "Politics", "summary": "Political tensions are rising as election campaigns intensify, with major candidates vying for leadership.", "predicted_category": "Politics"},
107
+ # {"index": 12, "title": "NBA Finals 2024: The Ultimate Showdown", "text": "The 2024 NBA Finals is shaping up to be one of the most thrilling matchups in basketball history... (rest of the article text)", "url": "https://example.com/nba-finals-2024", "category": "Sports", "summary": "The 2024 NBA Finals promises an epic showdown between two powerhouse teams, drawing attention from sports fans worldwide.", "predicted_category": "Sports"},
108
+ # {"index": 13, "title": "New Cybersecurity Threats: How to Protect Your Data", "text": "As cyber threats continue to evolve, experts are warning about new vulnerabilities affecting businesses and individuals... (rest of the article text)", "url": "https://example.com/cybersecurity-threats-2024", "category": "Science/Tech", "summary": "New cybersecurity threats are emerging, and experts provide tips on how to protect your data from malicious actors.", "predicted_category": "Science/Tech"},
109
+ # {"index": 14, "title": "Hollywood Stars Attend Gala for Charity", "text": "A star-studded charity gala was held last night, raising millions for a good cause... (rest of the article text)", "url": "https://example.com/hollywood-charity-gala", "category": "Entertainment", "summary": "A gala event in Hollywood saw major stars come together to raise millions for charity, highlighting philanthropic efforts in the industry.", "predicted_category": "Entertainment"},
110
+ # {"index": 15, "title": "Oil Prices Hit New Highs: Global Market Impact", "text": "Oil prices have hit record highs this month, affecting global markets and driving up fuel costs... (rest of the article text)", "url": "https://example.com/oil-prices-new-highs", "category": "Business", "summary": "Oil prices have surged, influencing global market trends and leading to higher fuel prices across the world.", "predicted_category": "Business"},
111
+ # {"index": 16, "title": "Exploring Mars: The Latest Space Mission Updates", "text": "NASA's latest Mars mission has yielded fascinating new insights about the planet's surface... (rest of the article text)", "url": "https://example.com/mars-space-mission-2024", "category": "Science/Tech", "summary": "NASA's latest Mars mission has provided new data about the planet's surface, shedding light on its potential for future exploration.", "predicted_category": "Science/Tech"},
112
+ # {"index": 17, "title": "New National Sports Policy to Boost Youth Involvement", "text": "The government has announced a new national sports policy aimed at increasing youth participation in sports... (rest of the article text)", "url": "https://example.com/national-sports-policy-2024", "category": "Sports", "summary": "A new national sports policy has been launched to boost youth engagement in physical activities and competitions.", "predicted_category": "Sports"},
113
+ # {"index": 18, "title": "Entertainment Industry Faces Challenges Amid Strike", "text": "The entertainment industry is facing significant disruptions as workers go on strike over pay and conditions... (rest of the article text)", "url": "https://example.com/entertainment-strike-2024", "category": "Entertainment", "summary": "The entertainment industry is experiencing major disruptions due to strikes over pay and working conditions.", "predicted_category": "Entertainment"},
114
+ # {"index": 19, "title": "China's Growing Influence in Global Politics", "text": "China's political influence continues to grow, shaping global policy decisions and economic trends... (rest of the article text)", "url": "https://example.com/china-global-influence", "category": "Politics", "summary": "China's political and economic influence is rapidly increasing, affecting global decision-making and alliances.", "predicted_category": "Politics"},
115
+ # {"index": 20, "title": "AI-Powered Tools Change the Game for Content Creators", "text": "Content creators are increasingly relying on AI-powered tools to streamline their production process and boost engagement... (rest of the article text)", "url": "https://example.com/ai-content-creators", "category": "Science/Tech", "summary": "AI-powered tools are revolutionizing content creation, helping creators improve efficiency and engagement with their audiences.", "predicted_category": "Science/Tech"}
116
+ # ]
117
+ # return temp
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "decoder_start_token_id": 0,
3
+ "eos_token_id": 1,
4
+ "pad_token_id": 0,
5
+ "transformers_version": "4.44.2"
6
+ }
label_encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdd75907b5940f619aac3bd28132a6b15af2bc628d0e676563c27e34db3f7865
3
+ size 317
logistic_regression_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acaee881e6d08738879f588e71b3a01a3f84641e38d0778e5ebef72b47b0d354
3
+ size 17602
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:197b346658905cda50c1a71fe0eb77a4d79306adaeb5d7a3c0e9208c91022d45
3
+ size 990345064
news_data.json ADDED
The diff for this file is too large to render. See raw diff
 
summarizer.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # # import torch
2
+ # from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+ # model = AutoModelForSeq2SeqLM.from_pretrained(r'FlanT5_base_fine_tuned_for_Summarization\model.safetensors')
4
+
5
+ # tokenizer = AutoTokenizer.from_pretrained("local_google_flan_t5_tokenizer")
6
+
7
+ # import torch
8
+
9
+ # def split_text(text, chunk_size=300, overlap_size=50):
10
+ # words = text.split()
11
+ # chunks = []
12
+ # for i in range(0, len(words), chunk_size - overlap_size):
13
+ # chunk = ' '.join(words[i:i + chunk_size])
14
+ # chunks.append(chunk)
15
+ # return chunks
16
+
17
+ # def summarize_chunks(model, tokenizer, chunks, use_prompt_template):
18
+ # summaries = []
19
+ # for chunk in chunks:
20
+ # # Tokenizing the input chunk
21
+ # if use_prompt_template:
22
+ # prompt = "Summarize the following news article. The summary should consist of multiple short sentences that cover the key points of the news article.Make relations between the news articles. \n\nNews Article: "
23
+ # inputs = tokenizer(prompt + chunk, return_tensors='pt', padding=True, truncation=True, max_length=375-40, add_special_tokens=True)
24
+ # else:
25
+ # inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=375, add_special_tokens=True)
26
+
27
+ # # Generate the summary
28
+ # model_output = model.generate(
29
+ # **inputs,
30
+ # min_length=10,
31
+ # max_length=200
32
+ # )
33
+ # summaries.append(tokenizer.decode(model_output[0], skip_special_tokens=True))
34
+ # return summaries
35
+
36
+ # def iterative_summarization(model, tokenizer, text, chunk_size=300, overlap_size=50, max_iterations=5, use_prompt_template=True):
37
+ # current_text = text
38
+ # iteration = 0
39
+
40
+ # while len(current_text.split()) > chunk_size and iteration < max_iterations:
41
+ # print(f"Iteration {iteration + 1}: Text length = {len(current_text.split())}")
42
+ # # Step 1: Split the text into smaller chunks
43
+ # chunks = split_text(current_text, chunk_size, overlap_size)
44
+
45
+ # # Step 2: Summarize each chunk
46
+ # chunk_summaries = summarize_chunks(model, tokenizer, chunks, use_prompt_template)
47
+
48
+ # # Step 3: Combine summaries
49
+ # current_text = ' '.join(chunk_summaries)
50
+ # iteration += 1
51
+
52
+ # return current_text
53
+
54
+ # # Example usage
55
+ # final_summary = iterative_summarization(
56
+ # model=model,
57
+ # tokenizer=tokenizer,
58
+ # text=long_text,
59
+ # chunk_size=350,
60
+ # overlap_size=50,
61
+ # max_iterations=5,
62
+ # use_prompt_template=False
63
+ # )
64
+
65
+ # print("Final Summary:", final_summary)
66
+
67
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
68
+
69
class Summarizer:
    """Summarize long text with a seq2seq model by repeated chunked summarization."""

    def __init__(self, model_path, tokenizer_path):
        """
        Initialize the Summarizer with the model and tokenizer.

        Args:
            model_path: Value passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
            tokenizer_path: Value passed to ``AutoTokenizer.from_pretrained``.
        """
        print("News Summarizer was called")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    def split_text(self, text, chunk_size=300, overlap_size=50):
        """
        Split text into whitespace-delimited word chunks with overlap.

        Args:
            text: Text to split.
            chunk_size: Maximum number of words per chunk.
            overlap_size: Number of words shared by consecutive chunks.

        Returns:
            A list of chunk strings; empty when ``text`` has no words.

        Raises:
            ValueError: If ``overlap_size`` is negative or not smaller than
                ``chunk_size``. Previously an overlap larger than the chunk
                size silently produced no chunks at all.
        """
        if not 0 <= overlap_size < chunk_size:
            raise ValueError(
                "overlap_size must be non-negative and smaller than chunk_size"
            )

        words = text.split()
        step = chunk_size - overlap_size
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # This chunk already reaches the end of the text; any further
            # chunk would only repeat words it contains.
            if start + chunk_size >= len(words):
                break
        return chunks

    def summarize_chunks(self, chunks, use_prompt_template):
        """
        Summarize each chunk of text using the model.

        Args:
            chunks: Iterable of chunk strings.
            use_prompt_template: When true, an instruction prompt is placed
                in front of every chunk before tokenization.

        Returns:
            A list with one decoded summary per chunk.
        """
        if use_prompt_template:
            prompt = (
                "Summarize the following news article. The summary should consist of multiple short sentences "
                "that cover the key points of the news article. Make relations between the news articles.\n\n"
                "Summarize News Article: "
            )
            # NOTE(review): the token limit is lower when the prompt is
            # added (375 - 40) - confirm this is the intended direction.
            max_input_tokens = 375 - 40
        else:
            prompt = ""
            max_input_tokens = 375

        summaries = []
        for chunk in chunks:
            inputs = self.tokenizer(
                prompt + chunk,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=max_input_tokens,
                add_special_tokens=True
            )

            # Generate the summary
            model_output = self.model.generate(
                **inputs,
                min_length=10,
                max_length=200
            )
            summaries.append(self.tokenizer.decode(model_output[0], skip_special_tokens=True))
        return summaries

    def iterative_summarization(self, text, chunk_size=300, overlap_size=50, max_iterations=5, use_prompt_template=True):
        """
        Perform iterative summarization to condense text into a final summary.

        While the text has more than ``chunk_size`` words (and fewer than
        ``max_iterations`` passes have run), it is split into chunks, each
        chunk is summarized, and the summaries are joined into the new text.

        Returns:
            The condensed text. Text that already has ``chunk_size`` words
            or fewer is returned unchanged, without calling the model.
        """
        # NOTE(review): short articles are therefore returned verbatim as
        # their own "summary" - confirm that is intended.
        current_text = text
        iteration = 0

        while len(current_text.split()) > chunk_size and iteration < max_iterations:
            print(f"Iteration {iteration + 1}: Text length = {len(current_text.split())}")
            # Step 1: Split the text into smaller chunks
            chunks = self.split_text(current_text, chunk_size, overlap_size)

            # Step 2: Summarize each chunk
            chunk_summaries = self.summarize_chunks(chunks, use_prompt_template)

            # Step 3: Combine summaries
            current_text = ' '.join(chunk_summaries)
            iteration += 1

        return current_text
149
+
150
+
151
+ # Example usage
152
+ if __name__ == "__main__":
153
+ model_path = r'C:\Users\TDH\VIIIIIV\MVIIXXIV\2024\Senior Project 1\huggingface_model_deployment\FlanT5_base_fine_tuned_for_Summarization'
154
+ tokenizer_path = r'C:\Users\TDH\VIIIIIV\MVIIXXIV\2024\Senior Project 1\huggingface_model_deployment\FlanT5_base_fine_tuned_for_Summarization\local_google_flan_t5_tokenizer'
155
+
156
+
157
+ long_text = """If you've been following the news lately, there are certain things you doubtless know about Mohammad Javad Zarif. He is, of course, the Iranian foreign minister. He has been U.S. Secretary of State John Kerry's opposite number in securing a breakthrough in nuclear discussions that could lead to an end to sanctions against Iran -- if the details can be worked out in the coming weeks. And he received a hero's welcome as he arrived in Iran on a sunny Friday morning. "Long live Zarif," crowds chanted as his car rolled slowly down the packed street. You may well have read that he is "polished" and, unusually for one burdened with such weighty issues, "jovial." An Internet search for "Mohammad Javad Zarif" and "jovial" yields thousands of results. He certainly has gone a long way to bring Iran in from the cold and allow it to rejoin the international community. But there are some facts about Zarif that are less well-known. Here are six: . In September 2013, Zarif tweeted "Happy Rosh Hashanah," referring to the Jewish New Year. That prompted Christine Pelosi, the daughter of House Minority Leader Nancy Pelosi, to respond with a tweet of her own: "Thanks. The New Year would be even sweeter if you would end Iran's Holocaust denial, sir." And, perhaps to her surprise, Pelosi got a response. "Iran never denied it," Zarif tweeted back. "The man who was perceived to be denying it is now gone. Happy New Year." The reference was likely to former Iranian President Mahmoud Ahmadinejad, who had left office the previous month. Zarif was nominated to be foreign minister by Ahmadinejad's successor, Hassan Rouhami. His foreign ministry notes, perhaps defensively, that "due to the political and security conditions of the time, he decided to continue his education in the United States." 
That is another way of saying that he was outside the country during the demonstrations against the Shah of Iran, which began in 1977, and during the Iranian Revolution, which drove the shah from power in 1979. Zarif left the country in 1977, received his undergraduate degree from San Francisco State University in 1981, his master's in international relations from the University of Denver in 1984 and his doctorate from the University of Denver in 1988. Both of his children were born in the United States. The website of the Iranian Foreign Ministry, which Zarif runs, cannot even agree with itself on when he was born. The first sentence of his official biography, perhaps in a nod to the powers that be in Tehran, says Zarif was "born to a religious traditional family in Tehran in 1959." Later on the same page, however, his date of birth is listed as January 8, 1960. And the Iranian Diplomacy website says he was born in in 1961 . So he is 54, 55 or maybe even 56. Whichever, he is still considerably younger than his opposite number, Kerry, who is 71. The feds investigated him over his alleged role in controlling the Alavi Foundation, a charitable organization. The U.S. Justice Department said the organization was secretly run on behalf of the Iranian government to launder money and get around U.S. sanctions. But last year, a settlement in the case, under which the foundation agreed to give a 36-story building in Manhattan along with other properties to the U.S. government, did not mention Zarif's name. Early in the Iranian Revolution, Zarif was among the students who took over the Iranian Consulate in San Francisco. The aim, says the website Iranian.com -- which cites Zarif's memoirs, titled "Mr. Ambassador" -- was to expel from the consulate people who were not sufficiently Islamic. Later, the website says, Zarif went to make a similar protest at the Iranian mission to the United Nations. In response, the Iranian ambassador to the United Nations offered him a job. 
In fact, he has now spent more time with Kerry than any other foreign minister in the world. And that amount of quality time will only increase as the two men, with help from other foreign ministers as well, try to meet a June 30 deadline for nailing down the details of the agreement they managed to outline this week in Switzerland.
158
+
159
+ """
160
+
161
+ summarizer = Summarizer(model_path, tokenizer_path)
162
+
163
+ final_summary = summarizer.iterative_summarization(
164
+ text=long_text,
165
+ chunk_size=350,
166
+ overlap_size=50,
167
+ max_iterations=5,
168
+ use_prompt_template=True
169
+ )
170
+
171
+ print("Final Summary:", final_summary)