Spaces:
Runtime error
Runtime error
Added Redis caching for feed content and article summaries.
Browse files- functions/feed_extraction.py +27 -16
- functions/summarization.py +21 -13
- functions/tools.py +20 -31
functions/feed_extraction.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
import re
|
|
|
|
| 5 |
import logging
|
| 6 |
import urllib.request
|
| 7 |
from urllib.error import HTTPError, URLError
|
|
@@ -110,28 +111,38 @@ def parse_feed(feed_uri: str) -> list:
|
|
| 110 |
|
| 111 |
if 'title' in entry and 'link' in entry:
|
| 112 |
|
| 113 |
-
|
| 114 |
-
entry_content['link'] = entry.link
|
| 115 |
|
| 116 |
-
#
|
| 117 |
-
|
| 118 |
-
|
|
|
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
| 122 |
|
| 123 |
-
#
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
|
|
|
| 129 |
|
| 130 |
-
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
entry_content
|
| 135 |
|
| 136 |
entries[i] = entry_content
|
| 137 |
|
|
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
import re
|
| 5 |
+
import json
|
| 6 |
import logging
|
| 7 |
import urllib.request
|
| 8 |
from urllib.error import HTTPError, URLError
|
|
|
|
| 111 |
|
| 112 |
if 'title' in entry and 'link' in entry:
|
| 113 |
|
| 114 |
+
title = entry.title
|
|
|
|
| 115 |
|
| 116 |
+
# Check the Redis cache for this entry
|
| 117 |
+
cache_key = title.lower().replace(' ', '_')
|
| 118 |
+
cache_hit = False
|
| 119 |
+
cached_entry = REDIS.get(cache_key)
|
| 120 |
|
| 121 |
+
if cached_entry:
|
| 122 |
+
cache_hit = True
|
| 123 |
+
entry_content = json.loads(cached_entry)
|
| 124 |
+
logger.info('Entry in Redis cache: "%s"', title)
|
| 125 |
|
| 126 |
+
# If it's not in the Redis cache, parse it from the feed data
|
| 127 |
+
else:
|
| 128 |
+
entry_content['title'] = entry.title
|
| 129 |
+
entry_content['link'] = entry.link
|
| 130 |
+
entry_content['content'] = None
|
| 131 |
+
|
| 132 |
+
if 'content' in entry:
|
| 133 |
+
entry_content['content'] = entry.content
|
| 134 |
+
|
| 135 |
+
if entry_content['content'] is None:
|
| 136 |
|
| 137 |
+
html = _get_html(entry_content['link'])
|
| 138 |
+
content = _get_text(html)
|
| 139 |
+
entry_content['content'] = content
|
| 140 |
|
| 141 |
+
logger.info('Parsed entry: "%s"', title)
|
| 142 |
|
| 143 |
+
# Add it to the Redis cache if it wasn't there
|
| 144 |
+
if cache_hit is False:
|
| 145 |
+
REDIS.set(cache_key, entry_content)
|
| 146 |
|
| 147 |
entries[i] = entry_content
|
| 148 |
|
functions/summarization.py
CHANGED
|
@@ -4,9 +4,14 @@ import os
|
|
| 4 |
import logging
|
| 5 |
|
| 6 |
from openai import OpenAI
|
|
|
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
def summarize_content(content: str) -> str:
|
| 10 |
'''Generates summary of article content using Modal inference endpoint.
|
| 11 |
|
| 12 |
Args:
|
|
@@ -19,6 +24,15 @@ def summarize_content(content: str) -> str:
|
|
| 19 |
logger = logging.getLogger(__name__ + '.summarize_content')
|
| 20 |
logger.info('Summarizing extracted content')
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
|
| 23 |
|
| 24 |
client.base_url = (
|
|
@@ -29,16 +43,6 @@ def summarize_content(content: str) -> str:
|
|
| 29 |
model = client.models.list().data[0]
|
| 30 |
model_id = model.id
|
| 31 |
|
| 32 |
-
# messages = [
|
| 33 |
-
# {
|
| 34 |
-
# 'role': 'system',
|
| 35 |
-
# 'content': ('You are a research assistant, skilled in summarizing documents in just '+
|
| 36 |
-
# 'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
|
| 37 |
-
# 'role': 'user',
|
| 38 |
-
# 'content': content
|
| 39 |
-
# }
|
| 40 |
-
# ]
|
| 41 |
-
|
| 42 |
messages = [
|
| 43 |
{
|
| 44 |
'role': 'system',
|
|
@@ -68,7 +72,11 @@ def summarize_content(content: str) -> str:
|
|
| 68 |
logger.error('Error during Modal API call: %s', e)
|
| 69 |
|
| 70 |
if response is not None:
|
| 71 |
-
|
| 72 |
|
| 73 |
else:
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import logging
|
| 5 |
|
| 6 |
from openai import OpenAI
|
| 7 |
+
from upstash_redis import Redis
|
| 8 |
|
| 9 |
+
REDIS = Redis(
|
| 10 |
+
url='https://sensible-midge-19304.upstash.io',
|
| 11 |
+
token=os.environ['UPSTASH_REDIS_KEY']
|
| 12 |
+
)
|
| 13 |
|
| 14 |
+
def summarize_content(title: str, content: str) -> str:
|
| 15 |
'''Generates summary of article content using Modal inference endpoint.
|
| 16 |
|
| 17 |
Args:
|
|
|
|
| 24 |
logger = logging.getLogger(__name__ + '.summarize_content')
|
| 25 |
logger.info('Summarizing extracted content')
|
| 26 |
|
| 27 |
+
# Check Redis cache for summary
|
| 28 |
+
cache_key = f"{title.lower().replace(' ', '_')}-summary"
|
| 29 |
+
cached_summary = REDIS.get(cache_key)
|
| 30 |
+
|
| 31 |
+
if cached_summary:
|
| 32 |
+
logger.info('Got summary from Redis cache: "%s"', title)
|
| 33 |
+
return cached_summary
|
| 34 |
+
|
| 35 |
+
# If the summary is not in the cache, generate it
|
| 36 |
client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
|
| 37 |
|
| 38 |
client.base_url = (
|
|
|
|
| 43 |
model = client.models.list().data[0]
|
| 44 |
model_id = model.id
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
messages = [
|
| 47 |
{
|
| 48 |
'role': 'system',
|
|
|
|
| 72 |
logger.error('Error during Modal API call: %s', e)
|
| 73 |
|
| 74 |
if response is not None:
|
| 75 |
+
summary = response.choices[0].message.content
|
| 76 |
|
| 77 |
else:
|
| 78 |
+
summary = None
|
| 79 |
+
|
| 80 |
+
REDIS.set(cache_key, summary)
|
| 81 |
+
logger.info('Summarized: "%s"', title)
|
| 82 |
+
return summary
|
functions/tools.py
CHANGED
|
@@ -6,20 +6,15 @@ import logging
|
|
| 6 |
import functions.feed_extraction as extraction_funcs
|
| 7 |
import functions.summarization as summarization_funcs
|
| 8 |
|
| 9 |
-
LOCAL_CACHE = {
|
| 10 |
-
'get_feed': {}
|
| 11 |
-
}
|
| 12 |
|
| 13 |
-
def get_feed(website: str
|
| 14 |
'''Gets RSS feed content from a given website. Can take a website or RSS
|
| 15 |
feed URL directly, or the name of a website. Will attempt to find RSS
|
| 16 |
feed and return title, summary and link to full article for most recent
|
| 17 |
-
items in feed
|
| 18 |
|
| 19 |
Args:
|
| 20 |
website: URL or name of website to extract RSS feed content from
|
| 21 |
-
use_cache: check local cache for content from RSS feed first before
|
| 22 |
-
downloading data from the website's RSS feed
|
| 23 |
|
| 24 |
Returns:
|
| 25 |
JSON string containing the feed content or 'No feed found' if an RSS
|
|
@@ -31,35 +26,29 @@ def get_feed(website: str, use_cache: bool = True) -> list:
|
|
| 31 |
logger = logging.getLogger(__name__ + '.get_feed()')
|
| 32 |
logger.info('Getting feed content for: %s', website)
|
| 33 |
|
| 34 |
-
#
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
logger.info('Got feed content from local cache')
|
| 38 |
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
return 'No feed found'
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
|
| 54 |
-
for i, item in content.items():
|
| 55 |
-
|
| 56 |
-
if item['content'] is not None:
|
| 57 |
-
summary = summarization_funcs.summarize_content(item['content'])
|
| 58 |
-
content[i]['summary'] = summary
|
| 59 |
-
|
| 60 |
-
content[i].pop('content', None)
|
| 61 |
-
|
| 62 |
-
LOCAL_CACHE['get_feed'][website] = content
|
| 63 |
|
| 64 |
logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
|
| 65 |
|
|
|
|
| 6 |
import functions.feed_extraction as extraction_funcs
|
| 7 |
import functions.summarization as summarization_funcs
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
+
def get_feed(website: str) -> list:
|
| 11 |
'''Gets RSS feed content from a given website. Can take a website or RSS
|
| 12 |
feed URL directly, or the name of a website. Will attempt to find RSS
|
| 13 |
feed and return title, summary and link to full article for most recent
|
| 14 |
+
items in feed.
|
| 15 |
|
| 16 |
Args:
|
| 17 |
website: URL or name of website to extract RSS feed content from
|
|
|
|
|
|
|
| 18 |
|
| 19 |
Returns:
|
| 20 |
JSON string containing the feed content or 'No feed found' if an RSS
|
|
|
|
| 26 |
logger = logging.getLogger(__name__ + '.get_feed()')
|
| 27 |
logger.info('Getting feed content for: %s', website)
|
| 28 |
|
| 29 |
+
# Find the feed's URI from the website name/URL
|
| 30 |
+
feed_uri = extraction_funcs.find_feed_uri(website)
|
| 31 |
+
logger.info('find_feed_uri() returned %s', feed_uri)
|
|
|
|
| 32 |
|
| 33 |
+
if 'No feed found' in feed_uri:
|
| 34 |
+
logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
|
| 35 |
+
return 'No feed found'
|
| 36 |
|
| 37 |
+
# Parse and extract content from the feed
|
| 38 |
+
content = extraction_funcs.parse_feed(feed_uri)
|
| 39 |
+
logger.info('parse_feed() returned %s entries', len(list(content.keys())))
|
| 40 |
|
| 41 |
+
# Summarize each post in the feed
|
| 42 |
+
for i, item in content.items():
|
|
|
|
| 43 |
|
| 44 |
+
if item['content'] is not None:
|
| 45 |
+
summary = summarization_funcs.summarize_content(
|
| 46 |
+
item['title'],
|
| 47 |
+
item['content']
|
| 48 |
+
)
|
| 49 |
+
content[i]['summary'] = summary
|
| 50 |
|
| 51 |
+
content[i].pop('content', None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
|
| 54 |
|