michaelkri commited on
Commit
18cd44b
·
1 Parent(s): ca9a164

Improved logging

Browse files
app/database.py CHANGED
@@ -4,6 +4,7 @@ from sqlalchemy.sql import func
4
  from contextlib import contextmanager
5
  import os
6
  import datetime
 
7
 
8
 
9
  class Base(DeclarativeBase):
@@ -41,15 +42,15 @@ TURSO_AUTH_TOKEN = os.getenv('TURSO_AUTH_TOKEN')
41
 
42
  # create an engine
43
  if not TURSO_DATABASE_URL or not TURSO_AUTH_TOKEN:
44
- print('Using local SQLite database')
45
- engine = create_engine('sqlite:///news.db', echo=True)
46
  else:
47
  engine = create_engine(
48
  f'sqlite+{TURSO_DATABASE_URL}?secure=true',
49
  connect_args={
50
  'auth_token': TURSO_AUTH_TOKEN
51
  },
52
- echo=True,
53
  )
54
 
55
  # create tables if needed
 
4
  from contextlib import contextmanager
5
  import os
6
  import datetime
7
+ import logging
8
 
9
 
10
  class Base(DeclarativeBase):
 
42
 
43
# Build the SQLAlchemy engine.
# When both Turso credentials are present, connect to the remote Turso
# database; otherwise fall back to a local file-backed SQLite database.
if TURSO_DATABASE_URL and TURSO_AUTH_TOKEN:
    # Remote Turso connection — the auth token travels in connect_args.
    engine = create_engine(
        f'sqlite+{TURSO_DATABASE_URL}?secure=true',
        connect_args={'auth_token': TURSO_AUTH_TOKEN},
        echo=False,
    )
else:
    logging.info('Using local SQLite database')
    engine = create_engine('sqlite:///news.db', echo=False)
55
 
56
  # create tables if needed
app/news_fetcher.py CHANGED
@@ -2,12 +2,13 @@ import feedparser
2
  from .summarizer import Summarizer
3
  from .scraper import get_articles
4
  from .database import Article, Source
 
5
 
6
 
7
  def topic_summary(summarizer: Summarizer, query: str, max_results: int = 5, min_cluster_size: int = 2) -> str:
8
- print(f'Beginning search for \'{query}\'...')
9
  articles_and_urls = get_articles(query, max_results=max_results)
10
- print(f'Retrieved {len(articles_and_urls)} articles')
11
  articles = [item[0] for item in articles_and_urls]
12
  urls = [item[1] for item in articles_and_urls]
13
  summary = summarizer.combined_summary(articles, min_cluster_size=min_cluster_size)
 
2
  from .summarizer import Summarizer
3
  from .scraper import get_articles
4
  from .database import Article, Source
5
+ import logging
6
 
7
 
8
  def topic_summary(summarizer: Summarizer, query: str, max_results: int = 5, min_cluster_size: int = 2) -> str:
9
+ logging.debug(f'Beginning search for \'{query}\'...')
10
  articles_and_urls = get_articles(query, max_results=max_results)
11
+ logging.debug(f'Retrieved {len(articles_and_urls)} articles')
12
  articles = [item[0] for item in articles_and_urls]
13
  urls = [item[1] for item in articles_and_urls]
14
  summary = summarizer.combined_summary(articles, min_cluster_size=min_cluster_size)
app/scraper.py CHANGED
@@ -1,5 +1,6 @@
1
  from ddgs import DDGS
2
  import trafilatura
 
3
 
4
 
5
  def retrieve_article(url: str) -> str:
@@ -7,7 +8,8 @@ def retrieve_article(url: str) -> str:
7
  page = trafilatura.fetch_url(url)
8
  return trafilatura.extract(page)
9
  except Exception as e:
10
- print(e.args)
 
11
  return None
12
 
13
 
@@ -17,7 +19,8 @@ def get_articles(query, max_results=5, min_article_length=100) -> list[tuple[str
17
  try:
18
  search_results = DDGS().news(query, timelimit='d', max_results=max_results)
19
  except Exception as e:
20
- print(e.args)
 
21
 
22
  # get article urls
23
  urls = set([r['url'] for r in search_results])
 
1
  from ddgs import DDGS
2
  import trafilatura
3
+ import logging
4
 
5
 
6
def retrieve_article(url: str) -> str:
    """Download the page at *url* and return its extracted main text.

    Returns None when the download or the extraction raises; the failure
    is logged (args at DEBUG, a short notice at ERROR) rather than raised.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        extracted = trafilatura.extract(downloaded)
    except Exception as err:
        logging.debug(err.args)
        logging.error('Error retrieving article')
        return None
    return extracted
14
 
15
 
 
19
  try:
20
  search_results = DDGS().news(query, timelimit='d', max_results=max_results)
21
  except Exception as e:
22
+ logging.debug(e.args)
23
+ logging.error('Error searching for articles')
24
 
25
  # get article urls
26
  urls = set([r['url'] for r in search_results])
app/summarizer.py CHANGED
@@ -3,6 +3,7 @@ from nltk import tokenize
3
  from sklearn.cluster import HDBSCAN
4
  from sklearn.metrics.pairwise import cosine_similarity
5
  import numpy as np
 
6
 
7
 
8
  class Summarizer:
@@ -71,7 +72,7 @@ class Summarizer:
71
  # combine key sentences from each cluster
72
  key_sentences = []
73
  for i, cluster in enumerate(clusters):
74
- print(f'Extracting from cluster {i + 1}...')
75
  top_sentences = Summarizer.rank_cluster_sentences(cluster)
76
  content = '\n'.join(top_sentences[:3])
77
  key_sentences.append(content)
@@ -79,7 +80,7 @@ class Summarizer:
79
  combined = ' '.join(key_sentences)
80
 
81
  # summarize all key sentences
82
- print('Creating response...')
83
  summary = self.summarize(
84
  combined,
85
  min_length=60,
@@ -92,7 +93,7 @@ class Summarizer:
92
  if not articles:
93
  return None
94
 
95
- print('Tokenizing into sentences...')
96
  # create a list of all sentences from all articles
97
  sentences = []
98
  for article in articles:
@@ -100,19 +101,19 @@ class Summarizer:
100
 
101
  # remove duplicate sentences
102
  sentences = sorted(list(set(sentences)), key=sentences.index)
103
- print(f'Found {len(sentences)} unique sentences')
104
 
105
  if not sentences:
106
  return None
107
 
108
- print('Creating sentence embeddings...')
109
  # create embeddings
110
  embeddings = self.create_embeddings(sentences)
111
 
112
- print('Grouping sentences into clusters...')
113
  # group (embeddings of) sentences by similarity
114
  clusters = self.cluster_sentences(sentences, embeddings, min_cluster_size=min_cluster_size)
115
- print(f'Created {len(clusters)} clusters')
116
 
117
  # summarize all clusters into a single summary
118
  summary = self.summarize_clusters(clusters)
 
3
  from sklearn.cluster import HDBSCAN
4
  from sklearn.metrics.pairwise import cosine_similarity
5
  import numpy as np
6
+ import logging
7
 
8
 
9
  class Summarizer:
 
72
  # combine key sentences from each cluster
73
  key_sentences = []
74
  for i, cluster in enumerate(clusters):
75
+ logging.debug(f'Extracting from cluster {i + 1}...')
76
  top_sentences = Summarizer.rank_cluster_sentences(cluster)
77
  content = '\n'.join(top_sentences[:3])
78
  key_sentences.append(content)
 
80
  combined = ' '.join(key_sentences)
81
 
82
  # summarize all key sentences
83
+ logging.debug('Creating response...')
84
  summary = self.summarize(
85
  combined,
86
  min_length=60,
 
93
  if not articles:
94
  return None
95
 
96
+ logging.debug('Tokenizing into sentences...')
97
  # create a list of all sentences from all articles
98
  sentences = []
99
  for article in articles:
 
101
 
102
  # remove duplicate sentences
103
  sentences = sorted(list(set(sentences)), key=sentences.index)
104
+ logging.debug(f'Found {len(sentences)} unique sentences')
105
 
106
  if not sentences:
107
  return None
108
 
109
+ logging.debug('Creating sentence embeddings...')
110
  # create embeddings
111
  embeddings = self.create_embeddings(sentences)
112
 
113
+ logging.debug('Grouping sentences into clusters...')
114
  # group (embeddings of) sentences by similarity
115
  clusters = self.cluster_sentences(sentences, embeddings, min_cluster_size=min_cluster_size)
116
+ logging.debug(f'Created {len(clusters)} clusters')
117
 
118
  # summarize all clusters into a single summary
119
  summary = self.summarize_clusters(clusters)
app/update_news.py CHANGED
@@ -4,6 +4,7 @@ from .summarizer import Summarizer
4
  from .news_fetcher import news_summary
5
  from .database import get_session, clear_articles, add_article, add_sources
6
  import datetime
 
7
 
8
 
9
  def read_rss_feed_urls(filename : str ='rss_feeds.txt') -> list[str]:
@@ -17,7 +18,7 @@ def read_rss_feed_urls(filename : str ='rss_feeds.txt') -> list[str]:
17
 
18
 
19
  def update_news():
20
- print(f'Initiating news update: {datetime.datetime.now()}')
21
 
22
  # model to create embeddings from sentences
23
  embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 
4
  from .news_fetcher import news_summary
5
  from .database import get_session, clear_articles, add_article, add_sources
6
  import datetime
7
+ import logging
8
 
9
 
10
  def read_rss_feed_urls(filename : str ='rss_feeds.txt') -> list[str]:
 
18
 
19
 
20
  def update_news():
21
+ logging.info(f'Initiating news update: {datetime.datetime.now()}')
22
 
23
  # model to create embeddings from sentences
24
  embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
test.py DELETED
@@ -1,4 +0,0 @@
1
- from database import DatabaseConnection
2
- db = DatabaseConnection()
3
- print(db.retrieve_articles())
4
- db.close()