Spaces:

Shreyas94
/

World_News

Sleeping

App Files Files Community

Shreyas94 committed on Jun 15, 2024

Commit

d530acf

verified ·

1 Parent(s): fe87932

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -70

app.py CHANGED Viewed

@@ -2,26 +2,12 @@ import gradio as gr
 from transformers import pipeline
 from bs4 import BeautifulSoup
 import requests
 import logging
-from newsapi import NewsApiClient
 # Configure logging
 logging.basicConfig(level=logging.DEBUG)
-# Initialize the News API client
-newsapi = NewsApiClient(api_key='5ab7bb1aaceb41b8993db03477098aad')
-def fetch_article_content(url):
-    try:
-        r = requests.get(url)
-        soup = BeautifulSoup(r.text, 'html.parser')
-        results = soup.find_all(['h1', 'p'])
-        text = [result.text for result in results]
-        return ' '.join(text)
-    except Exception as e:
-        logging.error(f"Error fetching content from {url}: {e}")
-        return ""
 def summarize_news(query, num_results=3):
     logging.debug(f"Query received: {query}")
     logging.debug(f"Number of results requested: {num_results}")
@@ -32,69 +18,75 @@ def summarize_news(query, num_results=3):
     # Search for news articles
     logging.debug("Searching for news articles...")
     articles = []
-    aggregated_content = ""
-    try:
-        news_results = newsapi.get_everything(q=query, language='en', page_size=num_results)
-        logging.debug(f"Search results: {news_results}")
-        for article in news_results['articles']:
-            url = article['url']
             logging.debug(f"Fetching content from URL: {url}")
-            content = fetch_article_content(url)
-            aggregated_content += content + " "
-    except Exception as e:
-        logging.error(f"Error fetching news articles: {e}")
-    # Chunk the aggregated content
-    logging.debug("Chunking the aggregated content...")
-    max_chunk = 500
-    aggregated_content = aggregated_content.replace('.', '.<eos>')
-    aggregated_content = aggregated_content.replace('?', '?<eos>')
-    aggregated_content = aggregated_content.replace('!', '!<eos>')
-    sentences = aggregated_content.split('<eos>')
-    current_chunk = 0
-    chunks = []
-    for sentence in sentences:
-        if len(chunks) == current_chunk + 1:
-            if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
-                chunks[current_chunk].extend(sentence.split(' '))
-            else:
-                current_chunk += 1
-                chunks.append(sentence.split(' '))
-        else:
-            chunks.append(sentence.split(' '))
-    for chunk_id in range(len(chunks)):
-        chunks[chunk_id] = ' '.join(chunks[chunk_id])
-    logging.debug(f"Chunks created: {chunks}")
-    # Summarize the chunks
-    logging.debug("Summarizing the chunks...")
-    try:
-        summaries = summarizer(chunks, max_length=120, min_length=30, do_sample=False)
-        summary_text = " ".join([summary['summary_text'] for summary in summaries])
-        # Reprocess the generated summary
-        logging.debug("Reprocessing the summary for cohesiveness and elaboration...")
-        final_summary = summarizer(summary_text, max_length=150, min_length=60, do_sample=False)[0]['summary_text']
-    except Exception as e:
-        logging.error(f"Error during summarization: {e}")
-        final_summary = "An error occurred during summarization."
-    logging.debug(f"Final summarized text: {final_summary}")
-    return final_summary
 iface = gr.Interface(
     fn=summarize_news,
-    inputs=[gr.Textbox(label="Query"), gr.Slider(minimum=1, maximum=10, value=3, label="Number of Results")],
     outputs="textbox",
     title="News Summarizer",
-    description="Enter a query to get a consolidated summary of the top news articles."
 )
 if __name__ == "__main__":
     logging.debug("Launching Gradio interface...")
-    iface.launch()

 from transformers import pipeline
 from bs4 import BeautifulSoup
 import requests
+from googlesearch import search
 import logging
 # Configure logging
 logging.basicConfig(level=logging.DEBUG)
 def summarize_news(query, num_results=3):
     logging.debug(f"Query received: {query}")
     logging.debug(f"Number of results requested: {num_results}")
     # Search for news articles
     logging.debug("Searching for news articles...")
+    search_results = search(query, num_results=num_results)
     articles = []
+    logging.debug(f"Search results: {search_results}")
+    for url in search_results:
+        try:
             logging.debug(f"Fetching content from URL: {url}")
+            # Fetch the content of the news article
+            r = requests.get(url)
+            soup = BeautifulSoup(r.text, 'html.parser')
+            results = soup.find_all(['h1', 'p'])
+            text = [result.text for result in results]
+            ARTICLE = ' '.join(text)
+            # Chunk the article text
+            logging.debug("Chunking the article text...")
+            max_chunk = 350
+            ARTICLE = ARTICLE.replace('.', '.<eos>')
+            ARTICLE = ARTICLE.replace('?', '?<eos>')
+            ARTICLE = ARTICLE.replace('!', '!<eos>')
+            sentences = ARTICLE.split('<eos>')
+            chunks = []
+            current_chunk = []
+            for sentence in sentences:
+                if len(' '.join(current_chunk + sentence.split())) <= max_chunk:
+                    current_chunk.extend(sentence.split())
+                else:
+                    chunks.append(' '.join(current_chunk))
+                    current_chunk = sentence.split()
+            chunks.append(' '.join(current_chunk))
+            logging.debug(f"Chunks created: {chunks}")
+            # Summarize the chunks
+            logging.debug("Summarizing the chunks...")
+            summaries = []
+            for chunk in chunks:
+                summaries.append(summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text'])
+            # Concatenate summaries and summarize again for cohesion
+            combined_summary = ' '.join(summaries)
+            final_summary = summarizer(combined_summary, max_length=300, min_length=80, do_sample=False)[0]['summary_text']
+            articles.append((url, final_summary))
+            logging.debug(f"Final summary for URL {url}: {final_summary}")
+        except Exception as e:
+            logging.error(f"Error processing URL {url}: {e}")
+            continue
+    logging.debug(f"Final summarized articles: {articles}")
+    return format_output(articles)
def format_output(articles):
    """Render ``(url, summary)`` pairs as a plain-text report.

    Args:
        articles: Iterable of ``(url, summary)`` pairs.

    Returns:
        A string with one ``URL: ...`` line and one ``Summary: ...`` line per
        pair, each entry followed by a blank line. Empty input yields ``""``.
    """
    # str.join assembles the text in one pass instead of growing a string
    # with += on every iteration.
    return "".join(
        f"URL: {url}\nSummary: {summary}\n\n" for url, summary in articles
    )
 # Build the web UI: a query textbox plus a result-count slider.
 # The bare "slider" shortcut uses Gradio's default slider settings, so the
 # component is configured explicitly: the count stays between 1 and 10,
 # defaults to 3 (the same default summarize_news declares), and moves in
 # whole-number steps.
 iface = gr.Interface(
     fn=summarize_news,
     inputs=[
         gr.Textbox(label="Query"),
         gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Results"),
     ],
     outputs="textbox",
     title="News Summarizer",
     description="Enter a query to get summarized versions of the top news articles."
 )
 # Only start the server when this file is run as a script.
 if __name__ == "__main__":
     logging.debug("Launching Gradio interface...")
     iface.launch()