michaelkri commited on
Commit
5d924ac
·
1 Parent(s): 469e535

Added Docker

Browse files
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // The name of the dev container
3
+ "name": "Focal",
4
+
5
+ // The Dockerfile to use for building the dev container
6
+ "dockerFile": "../Dockerfile",
7
+
8
+ // This is the key setting to prevent VS Code from overriding your CMD.
9
+ // Setting it to 'false' ensures your Dockerfile's CMD command is respected.
10
+ "overrideCommand": false,
11
+
12
+ // Any ports to forward from the container to your local machine
13
+ "forwardPorts": [7860],
14
+
15
+ // Extensions to install in the container
16
+ "customizations": {
17
+ "vscode": {
18
+ "extensions": [
19
+ "ms-python.python",
20
+ "ms-python.vscode-pylance"
21
+ ]
22
+ }
23
+ }
24
+ }
.dockerignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Include any files or directories that you don't want to be copied to your
2
+ # container here (e.g., local build artifacts, temporary files, etc.).
3
+ #
4
+ # For more help, visit the .dockerignore file reference guide at
5
+ # https://docs.docker.com/go/build-context-dockerignore/
6
+
7
+ **/.DS_Store
8
+ **/__pycache__
9
+ **/.venv
10
+ **/.classpath
11
+ **/.dockerignore
12
+ **/.env
13
+ **/.git
14
+ **/.gitignore
15
+ **/.project
16
+ **/.settings
17
+ **/.toolstarget
18
+ **/.vs
19
+ **/.vscode
20
+ **/*.*proj.user
21
+ **/*.dbmdl
22
+ **/*.jfm
23
+ **/bin
24
+ **/charts
25
+ **/docker-compose*
26
+ **/compose.y*ml
27
+ **/Dockerfile*
28
+ **/node_modules
29
+ **/npm-debug.log
30
+ **/obj
31
+ **/secrets.dev.yaml
32
+ **/values.dev.yaml
33
+ LICENSE
34
+ README.md
.github/dependabot.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for more information:
4
+ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5
+ # https://containers.dev/guide/dependabot
6
+
7
+ version: 2
8
+ updates:
9
+ - package-ecosystem: "devcontainers"
10
+ directory: "/"
11
+ schedule:
12
+ interval: weekly
.gitignore CHANGED
@@ -1,4 +1,4 @@
1
- __pycache__
2
- venv
3
- .venv
4
  *.db
 
1
+ __pycache__
2
+ venv
3
+ .venv
4
  *.db
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.Docker.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Building and running your application
2
+
3
+ When you're ready, start your application by running:
4
+ `docker compose up --build`.
5
+
6
+ Your application will be available at http://localhost:7860.
7
+
8
+ ### Deploying your application to the cloud
9
+
10
+ First, build your image, e.g.: `docker build -t myapp .`.
11
+ If your cloud uses a different CPU architecture than your development
12
+ machine (e.g., you are on a Mac M1 and your cloud provider is amd64),
13
+ you'll want to build the image for that platform, e.g.:
14
+ `docker build --platform=linux/amd64 -t myapp .`.
15
+
16
+ Then, push it to your registry, e.g. `docker push myregistry.com/myapp`.
17
+
18
+ Consult Docker's [getting started](https://docs.docker.com/go/get-started-sharing/)
19
+ docs for more detail on building and pushing.
20
+
21
+ ### References
22
+ * [Docker's Python guide](https://docs.docker.com/language/python/)
app/database.py CHANGED
@@ -1,86 +1,86 @@
1
- from sqlalchemy import create_engine, UnicodeText, DateTime, ForeignKey
2
- from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session, joinedload, relationship
3
- from sqlalchemy.sql import func
4
- from contextlib import contextmanager
5
- import datetime
6
-
7
-
8
- class Base(DeclarativeBase):
9
- pass
10
-
11
-
12
- class Article(Base):
13
- __tablename__ = 'article'
14
-
15
- id: Mapped[int] = mapped_column(primary_key=True)
16
- title: Mapped[str] = mapped_column(UnicodeText)
17
- content: Mapped[str] = mapped_column(UnicodeText)
18
- date: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
19
-
20
- sources = relationship('Source', backref='article')
21
-
22
- def __repr__(self) -> str:
23
- return f'Article(id={self.id}, title={self.title}, content={self.content})'
24
-
25
-
26
- class Source(Base):
27
- '''
28
- Represents a source URL for an article.
29
- '''
30
- __tablename__ = 'source'
31
-
32
- id: Mapped[int] = mapped_column(primary_key=True)
33
- url: Mapped[str] = mapped_column(UnicodeText)
34
- article_id: Mapped[int] = mapped_column(ForeignKey('article.id'))
35
-
36
-
37
- # create an engine
38
- engine = create_engine('sqlite:///news.db', echo=True)
39
-
40
- # create tables if needed
41
- Base.metadata.create_all(engine)
42
-
43
-
44
- @contextmanager
45
- def get_session():
46
- '''
47
- Context manager for creating and closing a database session
48
- '''
49
- # create a session
50
- session = Session(engine)
51
- try:
52
- yield session
53
- finally:
54
- session.close()
55
-
56
-
57
- def add_article(session: Session, article: Article):
58
- '''
59
- Adds a new article to the database
60
- '''
61
- session.add(article)
62
- session.commit()
63
-
64
-
65
- def retrieve_articles(session: Session):
66
- '''
67
- Returns a list containing all articles
68
- '''
69
- return session.query(Article).options(joinedload(Article.sources)).all()
70
-
71
-
72
- def clear_articles(session: Session):
73
- '''
74
- Deletes all articles in the database
75
- '''
76
- session.query(Article).delete()
77
- session.commit()
78
-
79
-
80
- def add_sources(session: Session, sources: list[Source]):
81
- '''
82
- Adds the given sources to the database
83
- '''
84
- for source in sources:
85
- session.add(source)
86
  session.commit()
 
1
+ from sqlalchemy import create_engine, UnicodeText, DateTime, ForeignKey
2
+ from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session, joinedload, relationship
3
+ from sqlalchemy.sql import func
4
+ from contextlib import contextmanager
5
+ import datetime
6
+
7
+
8
class Base(DeclarativeBase):
    """Shared declarative base for all ORM models in this module."""
10
+
11
+
12
class Article(Base):
    '''
    A summarized news article stored in the database.
    '''
    __tablename__ = 'article'

    # surrogate primary key
    id: Mapped[int] = mapped_column(primary_key=True)
    # headline used as the article title
    title: Mapped[str] = mapped_column(UnicodeText)
    # generated summary text
    content: Mapped[str] = mapped_column(UnicodeText)
    # creation timestamp assigned by the database server
    date: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())

    # source rows backing this article; the backref also gives Source an
    # 'article' attribute
    sources = relationship('Source', backref='article')

    def __repr__(self) -> str:
        return f'Article(id={self.id}, title={self.title}, content={self.content})'
24
+
25
+
26
class Source(Base):
    '''
    Represents a source URL for an article.
    '''
    __tablename__ = 'source'

    # surrogate primary key
    id: Mapped[int] = mapped_column(primary_key=True)
    # URL of the original article this source points at
    url: Mapped[str] = mapped_column(UnicodeText)
    # owning article (see Article.sources relationship)
    article_id: Mapped[int] = mapped_column(ForeignKey('article.id'))
35
+
36
+
37
# module-level engine backed by the SQLite file news.db
# NOTE(review): echo=True logs every SQL statement — presumably intentional
# for debugging; confirm before relying on this in production.
engine = create_engine('sqlite:///news.db', echo=True)

# create any missing tables at import time
Base.metadata.create_all(engine)
42
+
43
+
44
@contextmanager
def get_session():
    '''
    Context manager yielding a database session that is always closed.
    '''
    # Session implements the context-manager protocol and closes itself
    # on exit, which replaces an explicit try/finally around close().
    with Session(engine) as session:
        yield session
55
+
56
+
57
def add_article(session: Session, article: Article):
    '''
    Persist a single new article and commit the transaction immediately.
    '''
    session.add(article)
    session.commit()
63
+
64
+
65
def retrieve_articles(session: Session):
    '''
    Return a list of all articles, with their sources eagerly loaded.
    '''
    # joinedload avoids one extra query per article when templates read
    # article.sources
    query = session.query(Article).options(joinedload(Article.sources))
    return query.all()
70
+
71
+
72
def clear_articles(session: Session):
    '''
    Delete every stored article and commit the deletion.
    '''
    session.query(Article).delete()
    session.commit()
78
+
79
+
80
def add_sources(session: Session, sources: list[Source]):
    '''
    Persist the given sources and commit once at the end.
    '''
    # add_all replaces the previous manual add() loop — one call, same
    # single commit afterwards
    session.add_all(sources)
    session.commit()
app/main.py CHANGED
@@ -1,71 +1,71 @@
1
- from fastapi import FastAPI, Request, BackgroundTasks
2
- from fastapi.templating import Jinja2Templates
3
- from contextlib import asynccontextmanager
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from .database import get_session, retrieve_articles
6
- from .update_news import update_news
7
- import threading
8
-
9
-
10
- # for updating the news feed periodically
11
- scheduler = BackgroundScheduler()
12
-
13
- # prevent initiating multiple concurrent updates
14
- is_updating = threading.Lock()
15
-
16
-
17
- @asynccontextmanager
18
- async def lifespan(app: FastAPI):
19
- # update news periodically
20
- scheduler.add_job(update_news, 'interval', minutes=30)
21
- scheduler.start()
22
- yield
23
- # stop the scheduler when closing the server
24
- scheduler.shutdown()
25
-
26
-
27
- app = FastAPI(lifespan=lifespan)
28
-
29
- templates = Jinja2Templates(directory='app/templates')
30
-
31
-
32
- def safe_update_news():
33
- '''
34
- Wrapper for update_news to ensure only one instance runs at a time.
35
- '''
36
- # check whether there is no update currently in progress
37
- if not is_updating.locked():
38
- # update news in background
39
- with is_updating:
40
- update_news()
41
-
42
-
43
- @app.get('/')
44
- async def read_root(request: Request, background_tasks: BackgroundTasks):
45
- # retrieve articles from database
46
- with get_session() as session:
47
- articles = retrieve_articles(session=session)
48
-
49
- # no articles yet
50
- if not articles:
51
- # update news in background
52
- background_tasks.add_task(safe_update_news)
53
-
54
- return templates.TemplateResponse(
55
- 'index.html',
56
- {
57
- 'request': request,
58
- 'articles': articles,
59
- 'message': 'Summarizing news articles now.'
60
- ' Please check back in a few minutes.'
61
- }
62
- )
63
-
64
- return templates.TemplateResponse(
65
- 'index.html',
66
- {
67
- 'request': request,
68
- 'articles': articles,
69
- 'message': 'Additional articles are being summarized right now.' if is_updating.locked() else None
70
- }
71
- )
 
1
+ from fastapi import FastAPI, Request, BackgroundTasks
2
+ from fastapi.templating import Jinja2Templates
3
+ from contextlib import asynccontextmanager
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from .database import get_session, retrieve_articles
6
+ from .update_news import update_news
7
+ import threading
8
+
9
+
10
# background scheduler that refreshes the news feed on a fixed interval
scheduler = BackgroundScheduler()

# lock guarding against two news updates running concurrently
is_updating = threading.Lock()
15
+
16
+
17
@asynccontextmanager
async def lifespan(app: FastAPI):
    '''
    FastAPI lifespan hook: keep the refresh scheduler running while the
    application is up.
    '''
    # refresh the news every 30 minutes
    scheduler.add_job(update_news, 'interval', minutes=30)
    scheduler.start()
    yield
    # stop the scheduler when the server shuts down
    scheduler.shutdown()
25
+
26
+
27
# application instance wired to the lifespan hook above
app = FastAPI(lifespan=lifespan)

# Jinja2 templates rendered from app/templates
templates = Jinja2Templates(directory='app/templates')
30
+
31
+
32
def safe_update_news():
    '''
    Wrapper for update_news to ensure only one instance runs at a time.

    Uses a non-blocking acquire so the "is an update running?" check and
    the lock acquisition are one atomic step. The previous
    locked()-then-`with` pattern had a race window: two tasks could both
    observe the lock as free, and the second would then block and run a
    redundant update as soon as the first finished.
    '''
    if is_updating.acquire(blocking=False):
        try:
            update_news()
        finally:
            is_updating.release()
41
+
42
+
43
@app.get('/')
async def read_root(request: Request, background_tasks: BackgroundTasks):
    '''
    Render the front page with all summarized articles.

    When the database is empty, a background refresh is scheduled and the
    user is told to check back later; when a refresh is already running,
    a progress message is shown instead.
    '''
    # retrieve articles from database
    with get_session() as session:
        articles = retrieve_articles(session=session)

    # pick the status message, then build one response (the original
    # duplicated the TemplateResponse construction in both branches)
    if not articles:
        # no articles yet: update news in background
        background_tasks.add_task(safe_update_news)
        message = ('Summarizing news articles now.'
                   ' Please check back in a few minutes.')
    elif is_updating.locked():
        message = 'Additional articles are being summarized right now.'
    else:
        message = None

    return templates.TemplateResponse(
        'index.html',
        {
            'request': request,
            'articles': articles,
            'message': message
        }
    )
app/news_fetcher.py CHANGED
@@ -1,56 +1,56 @@
1
- import feedparser
2
- from .summarizer import Summarizer
3
- from .scraper import get_articles
4
- from .database import Article, Source
5
-
6
-
7
- def topic_summary(summarizer: Summarizer, query: str, max_results: int = 5, min_cluster_size: int = 2) -> str:
8
- print(f'Beginning search for \'{query}\'...')
9
- articles_and_urls = get_articles(query, max_results=max_results)
10
- print(f'Retrieved {len(articles_and_urls)} articles')
11
- articles = [item[0] for item in articles_and_urls]
12
- urls = [item[1] for item in articles_and_urls]
13
- summary = summarizer.combined_summary(articles, min_cluster_size=min_cluster_size)
14
- return summary, urls
15
-
16
-
17
- def fetch_headlines(summarizer: Summarizer, rss_feed_urls: list[str], max_articles=30) -> list[str]:
18
- # parse the RSS feeds
19
- feeds = [feedparser.parse(url) for url in rss_feed_urls]
20
-
21
- # extract max_article headlines from each feed
22
- headlines = [entry.title for feed in feeds for entry in feed.entries[:max_articles]]
23
-
24
- # create embeddings for each headline
25
- embeddings = summarizer.create_embeddings(headlines)
26
-
27
- # cluster headlines based on embeddings
28
- clusters = summarizer.cluster_sentences(headlines, embeddings)
29
-
30
- clustered_headlines = []
31
- for cluster in clusters:
32
- ranked_headlines = summarizer.rank_cluster_sentences(cluster)
33
-
34
- generic_headline = ranked_headlines[0]
35
-
36
- clustered_headlines.append(generic_headline)
37
-
38
- return clustered_headlines
39
-
40
-
41
- def news_summary(summarizer: Summarizer, rss_feed_urls: list[str]):
42
- # fetch current headlines
43
- headlines = fetch_headlines(summarizer, rss_feed_urls)
44
-
45
- # summarize each topic and yield articles
46
- for headline in headlines:
47
- summary, source_urls = topic_summary(summarizer, headline)
48
-
49
- article = Article(
50
- title=headline,
51
- content=summary
52
- )
53
-
54
- sources = [Source(url=s_url, article_id=article.id) for s_url in source_urls]
55
-
56
  yield article, sources
 
1
+ import feedparser
2
+ from .summarizer import Summarizer
3
+ from .scraper import get_articles
4
+ from .database import Article, Source
5
+
6
+
7
def topic_summary(summarizer: Summarizer, query: str, max_results: int = 5, min_cluster_size: int = 2) -> tuple[str, list[str]]:
    '''
    Search for news on `query` and summarize the retrieved articles.

    Returns a (summary, urls) pair — the combined summary text and the
    list of source URLs. (The previous annotation said `-> str`, but the
    function has always returned a tuple.)
    '''
    print(f'Beginning search for \'{query}\'...')
    articles_and_urls = get_articles(query, max_results=max_results)
    print(f'Retrieved {len(articles_and_urls)} articles')
    # split the (article_text, url) pairs
    articles = [article for article, _ in articles_and_urls]
    urls = [url for _, url in articles_and_urls]
    summary = summarizer.combined_summary(articles, min_cluster_size=min_cluster_size)
    return summary, urls
15
+
16
+
17
def fetch_headlines(summarizer: Summarizer, rss_feed_urls: list[str], max_articles=30) -> list[str]:
    '''
    Pull headlines from the given RSS feeds and collapse near-duplicates.

    Headlines are embedded, clustered by semantic similarity, and each
    cluster is represented by its most central headline.
    '''
    parsed_feeds = [feedparser.parse(feed_url) for feed_url in rss_feed_urls]

    # take at most max_articles entries from each feed
    headlines = [
        entry.title
        for feed in parsed_feeds
        for entry in feed.entries[:max_articles]
    ]

    # embed and cluster the headlines
    embeddings = summarizer.create_embeddings(headlines)
    clusters = summarizer.cluster_sentences(headlines, embeddings)

    # represent each cluster by its top-ranked (most central) headline
    return [summarizer.rank_cluster_sentences(cluster)[0] for cluster in clusters]
39
+
40
+
41
def news_summary(summarizer: Summarizer, rss_feed_urls: list[str]):
    '''
    Generator yielding one (Article, [Source, ...]) pair per current
    headline topic.
    '''
    for headline in fetch_headlines(summarizer, rss_feed_urls):
        summary, source_urls = topic_summary(summarizer, headline)

        article = Article(
            title=headline,
            content=summary
        )

        # NOTE(review): article.id is presumably None here since the
        # article has not been persisted yet; the caller appears to fix up
        # Source.article_id after inserting the article — confirm.
        sources = [Source(url=s_url, article_id=article.id) for s_url in source_urls]

        yield article, sources
app/scraper.py CHANGED
@@ -1,28 +1,28 @@
1
- from ddgs import DDGS
2
- import trafilatura
3
-
4
-
5
- def retrieve_article(url: str) -> str:
6
- try:
7
- page = trafilatura.fetch_url(url)
8
- return trafilatura.extract(page)
9
- except Exception as e:
10
- print(e.args)
11
- return None
12
-
13
-
14
- def get_articles(query, max_results=5, min_article_length=100) -> list[tuple[str, str]]:
15
- # search DuckDuckGo for news on query
16
- search_results = []
17
- try:
18
- search_results = DDGS().news(query, timelimit='d', max_results=max_results)
19
- except Exception as e:
20
- print(e.args)
21
-
22
- # get article urls
23
- urls = set([r['url'] for r in search_results])
24
-
25
- # try to retrieve articles (filter inaccessible and short articles)
26
- articles = [(article, url) for url in urls if (article := retrieve_article(url)) is not None and len(article) > min_article_length]
27
-
28
  return articles
 
1
+ from ddgs import DDGS
2
+ import trafilatura
3
+
4
+
5
def retrieve_article(url: str) -> "str | None":
    '''
    Download and extract the main text of the article at `url`.

    Returns the extracted text, or None when the fetch raises or no text
    could be extracted (trafilatura.extract can itself return None).
    The annotation is quoted so it stays valid on Python 3.9 (the
    Dockerfile's base image), where `str | None` cannot be evaluated.
    '''
    try:
        page = trafilatura.fetch_url(url)
        return trafilatura.extract(page)
    except Exception as e:
        # best-effort scraping: log and skip articles that fail
        print(e.args)
        return None
12
+
13
+
14
def get_articles(query, max_results=5, min_article_length=100) -> list[tuple[str, str]]:
    '''
    Search DuckDuckGo News for `query` and scrape the resulting articles.

    Returns (article_text, url) pairs, skipping pages that could not be
    retrieved or whose extracted text is at most min_article_length chars.
    '''
    search_results = []
    try:
        search_results = DDGS().news(query, timelimit='d', max_results=max_results)
    except Exception as e:
        # best-effort: a failed search simply yields no articles
        print(e.args)

    # deduplicate article urls (set comprehension instead of set([...]))
    urls = {r['url'] for r in search_results}

    # try to retrieve articles (filter inaccessible and short articles)
    articles = [
        (article, url)
        for url in urls
        if (article := retrieve_article(url)) is not None
        and len(article) > min_article_length
    ]

    return articles
app/summarizer.py CHANGED
@@ -1,120 +1,120 @@
1
- import nltk
2
- from nltk import tokenize
3
- from sklearn.cluster import HDBSCAN
4
- from sklearn.metrics.pairwise import cosine_similarity
5
- import numpy as np
6
-
7
-
8
- class Summarizer:
9
- def __init__(self, embedding_model, summarization_model):
10
- nltk.download('punkt_tab') # for tokenizing into sentences
11
-
12
- self.embedding_model = embedding_model
13
- self.summarization_model = summarization_model
14
-
15
-
16
- def cluster_sentences(self, sentences, embeddings, min_cluster_size=2):
17
- hdb = HDBSCAN(min_cluster_size=min_cluster_size).fit(embeddings)
18
-
19
- clusters = {}
20
- for i, sentence in enumerate(sentences):
21
- cluster_id = hdb.labels_[i]
22
- if cluster_id == -1: # discard "noise"
23
- continue
24
- if cluster_id not in clusters:
25
- clusters[cluster_id] = []
26
- clusters[cluster_id].append((sentence, embeddings[i]))
27
-
28
- return list(clusters.values())
29
-
30
-
31
- def create_embeddings(self, sentences):
32
- return self.embedding_model.encode(sentences)
33
-
34
-
35
- def summarize(self, content, min_length=30, max_length=130):
36
- # truncate content if exceeds model capabilities
37
- max_model_length = self.summarization_model.model.config.max_position_embeddings
38
- if len(content) > max_model_length:
39
- content = content[:max_model_length]
40
-
41
- max_length = max(min_length, min(len(content), max_length))
42
-
43
- return self.summarization_model(
44
- content,
45
- min_length=min_length,
46
- max_length=max_length,
47
- do_sample=False)[0]['summary_text']
48
-
49
-
50
- @staticmethod
51
- def rank_cluster_sentences(cluster):
52
- # separate sentences and embeddings
53
- sentences = [entry[0] for entry in cluster]
54
- embeddings = [entry[1] for entry in cluster]
55
- # find center of cluster
56
- center = np.mean(embeddings, axis=0)
57
- # score sentences by similarity to center
58
- scores = cosine_similarity([center], embeddings)[0]
59
- # rank sentences by score
60
- ranked_sentences = [sentence for _, sentence in sorted(zip(scores, sentences), reverse=True)]
61
- return ranked_sentences
62
-
63
-
64
- def summarize_clusters(self, clusters, top_cluster_count=5):
65
- # sort clusters by their length (descending) to find the most important topics
66
- clusters = sorted(clusters, key=len, reverse=True)
67
-
68
- # take only the top_cluster_count clusters
69
- clusters = clusters[:top_cluster_count]
70
-
71
- # combine key sentences from each cluster
72
- key_sentences = []
73
- for i, cluster in enumerate(clusters):
74
- print(f'Extracting from cluster {i + 1}...')
75
- top_sentences = Summarizer.rank_cluster_sentences(cluster)
76
- content = '\n'.join(top_sentences[:3])
77
- key_sentences.append(content)
78
-
79
- combined = ' '.join(key_sentences)
80
-
81
- # summarize all key sentences
82
- print('Creating response...')
83
- summary = self.summarize(
84
- combined,
85
- min_length=60,
86
- max_length=250
87
- )
88
- return summary
89
-
90
-
91
- def combined_summary(self, articles, min_cluster_size=3):
92
- if not articles:
93
- return 'No articles to summarize.'
94
-
95
- print('Tokenizing into sentences...')
96
- # create a list of all sentences from all articles
97
- sentences = []
98
- for article in articles:
99
- sentences.extend(tokenize.sent_tokenize(str.strip(article)))
100
-
101
- # remove duplicate sentences
102
- sentences = sorted(list(set(sentences)), key=sentences.index)
103
- print(f'Found {len(sentences)} unique sentences')
104
-
105
- if not sentences:
106
- return None
107
-
108
- print('Creating sentence embeddings...')
109
- # create embeddings
110
- embeddings = self.create_embeddings(sentences)
111
-
112
- print('Grouping sentences into clusters...')
113
- # group (embeddings of) sentences by similarity
114
- clusters = self.cluster_sentences(sentences, embeddings, min_cluster_size=min_cluster_size)
115
- print(f'Created {len(clusters)} clusters')
116
-
117
- # summarize all clusters into a single summary
118
- summary = self.summarize_clusters(clusters)
119
-
120
  return summary
 
1
+ import nltk
2
+ from nltk import tokenize
3
+ from sklearn.cluster import HDBSCAN
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import numpy as np
6
+
7
+
8
class Summarizer:
    '''
    Extractive-plus-abstractive news summarizer.

    Sentences are embedded, clustered by semantic similarity, and the
    most central sentences of the largest clusters are condensed by an
    abstractive summarization model.
    '''

    def __init__(self, embedding_model, summarization_model):
        # punkt_tab is needed by nltk's sentence tokenizer
        nltk.download('punkt_tab')

        self.embedding_model = embedding_model
        self.summarization_model = summarization_model


    def cluster_sentences(self, sentences, embeddings, min_cluster_size=2):
        '''
        Group sentences by embedding similarity using HDBSCAN.

        Returns a list of clusters, each a list of (sentence, embedding)
        pairs. Points HDBSCAN labels -1 ("noise") are discarded.
        '''
        hdb = HDBSCAN(min_cluster_size=min_cluster_size).fit(embeddings)

        clusters = {}
        for i, sentence in enumerate(sentences):
            cluster_id = hdb.labels_[i]
            if cluster_id == -1:  # discard "noise"
                continue
            # setdefault replaces the explicit key-existence check
            clusters.setdefault(cluster_id, []).append((sentence, embeddings[i]))

        return list(clusters.values())


    def create_embeddings(self, sentences):
        '''Return embeddings for the given sentences.'''
        return self.embedding_model.encode(sentences)


    def summarize(self, content, min_length=30, max_length=130):
        '''
        Abstractively summarize `content`.

        Input is truncated to the model's maximum position embeddings, and
        max_length is clamped so the requested summary is never longer
        than the input (but never below min_length).
        '''
        # truncate content if it exceeds model capabilities
        max_model_length = self.summarization_model.model.config.max_position_embeddings
        if len(content) > max_model_length:
            content = content[:max_model_length]

        max_length = max(min_length, min(len(content), max_length))

        return self.summarization_model(
            content,
            min_length=min_length,
            max_length=max_length,
            do_sample=False)[0]['summary_text']


    @staticmethod
    def rank_cluster_sentences(cluster):
        '''
        Rank a cluster's sentences by cosine similarity to the cluster
        centroid, most central first.
        '''
        # separate sentences and embeddings
        sentences = [entry[0] for entry in cluster]
        embeddings = [entry[1] for entry in cluster]
        # find center of cluster
        center = np.mean(embeddings, axis=0)
        # score sentences by similarity to center
        scores = cosine_similarity([center], embeddings)[0]
        # rank sentences by score, best first
        ranked_sentences = [sentence for _, sentence in sorted(zip(scores, sentences), reverse=True)]
        return ranked_sentences


    def summarize_clusters(self, clusters, top_cluster_count=5):
        '''
        Summarize the largest `top_cluster_count` clusters into one text.
        '''
        # cluster size is used as a proxy for topic importance
        clusters = sorted(clusters, key=len, reverse=True)[:top_cluster_count]

        # combine the top three key sentences from each cluster
        key_sentences = []
        for i, cluster in enumerate(clusters):
            print(f'Extracting from cluster {i + 1}...')
            top_sentences = Summarizer.rank_cluster_sentences(cluster)
            content = '\n'.join(top_sentences[:3])
            key_sentences.append(content)

        combined = ' '.join(key_sentences)

        # summarize all key sentences
        print('Creating response...')
        summary = self.summarize(
            combined,
            min_length=60,
            max_length=250
        )
        return summary


    def combined_summary(self, articles, min_cluster_size=3):
        '''
        Produce a single summary from a list of article texts.

        Returns the summary string, the literal 'No articles to summarize.'
        when `articles` is empty, or None when no sentences were found.
        '''
        if not articles:
            return 'No articles to summarize.'

        print('Tokenizing into sentences...')
        # create a list of all sentences from all articles
        sentences = []
        for article in articles:
            sentences.extend(tokenize.sent_tokenize(str.strip(article)))

        # de-duplicate while keeping first-occurrence order; dict.fromkeys
        # is O(n), unlike the previous sorted(..., key=sentences.index)
        # which called list.index per element (O(n^2))
        sentences = list(dict.fromkeys(sentences))
        print(f'Found {len(sentences)} unique sentences')

        if not sentences:
            return None

        print('Creating sentence embeddings...')
        embeddings = self.create_embeddings(sentences)

        print('Grouping sentences into clusters...')
        # group (embeddings of) sentences by similarity
        clusters = self.cluster_sentences(sentences, embeddings, min_cluster_size=min_cluster_size)
        print(f'Created {len(clusters)} clusters')

        # summarize all clusters into a single summary
        return self.summarize_clusters(clusters)
app/templates/index.html CHANGED
@@ -1,123 +1,123 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Focal</title>
7
- <script src="https://cdn.tailwindcss.com"></script>
8
- <link rel="preconnect" href="https://fonts.googleapis.com">
9
- <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
- <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@600&family=Inter:wght@400;500&display=swap" rel="stylesheet">
11
- <script>
12
- tailwind.config = {
13
- theme: {
14
- extend: {
15
- fontFamily: {
16
- sans: ['Inter', 'sans-serif'],
17
- display: ['Poppins', 'sans-serif'],
18
- },
19
- colors: {
20
- 'background': '#FFFFFF',
21
- 'background-light': '#F9F9F9',
22
- 'text-primary': '#121212',
23
- 'primary': '#2563EB',
24
- 'primary-dark': '#1E40AF',
25
- }
26
- }
27
- }
28
- }
29
- </script>
30
- <style>
31
- .collapsible-content {
32
- max-height: 0;
33
- overflow: hidden;
34
- transition: max-height 0.5s cubic-bezier(0.4, 0, 0.2, 1);
35
- }
36
- .collapsible-content.expanded {
37
- max-height: 500px;
38
- }
39
- .collapsible-content .border {
40
- border-color: #E5E7EB;
41
- }
42
- </style>
43
- </head>
44
- <body class="bg-background text-text-primary font-sans antialiased">
45
-
46
- <header class="container mx-auto px-6 py-5">
47
- <div class="flex items-center border-b border-primary/20 pb-3">
48
- <svg class="h-8 w-8 mr-2" viewBox="0 0 32 32" fill="black" xmlns="http://www.w3.org/2000/svg">
49
- <circle cx="16" cy="16" r="10" />
50
- </svg>
51
- <a href="#" class="font-display text-2xl tracking-widest uppercase">Focal</a>
52
- </div>
53
- </header>
54
-
55
- <main class="container mx-auto px-6 py-12 max-w-4xl">
56
-
57
- {% if message %}
58
- <div class="text-center text-lg text-text-primary">
59
- {{ message }}
60
- </div>
61
- {% endif %}
62
-
63
- {% for article in articles %}
64
- <article class="mb-20 md:mb-24">
65
- <p class="text-sm font-medium text-primary uppercase tracking-wider mb-3">{{ article.date }}</p>
66
- <h1 class="font-display font-semibold text-2xl md:text-3xl text-text-primary mb-6 leading-tight">
67
- {{ article.title }}
68
- </h1>
69
- <div class="text-lg text-text-primary/80 leading-relaxed space-y-5">
70
- <p>
71
- {{ article.content }}
72
- </p>
73
- </div>
74
- {% if article.sources %}
75
- <div class="mt-8">
76
- <button class="toggle-button flex items-center text-primary font-medium text-sm transition-colors hover:text-text-primary">
77
- <span class="mr-2">Further Reading</span>
78
- <svg class="plus-icon h-4 w-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M12 4.5v15m7.5-7.5h-15" /></svg>
79
- <svg class="minus-icon h-4 w-4 hidden" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M19.5 12h-15" /></svg>
80
- </button>
81
-
82
- <div class="collapsible-content mt-4">
83
- <div class="bg-background-light p-5 rounded-md border border-gray-700/50">
84
- <ul class="space-y-3 list-disc list-inside text-text-primary/70">
85
- {% for source in article.sources %}
86
- <li><a href="{{ source.url }}" class="hover:text-primary transition-colors">{{ source.url }}</a></li>
87
- {% endfor %}
88
- </ul>
89
- </div>
90
- </div>
91
- </div>
92
- {% endif %}
93
- </article>
94
- {% endfor %}
95
-
96
- </main>
97
-
98
- <footer class="container mx-auto px-6 py-4 text-center text-sm text-text-primary/60">
99
- <p>All content is AI-generated, and may contain inaccuracies.</p>
100
- </footer>
101
-
102
- <script>
103
- // JavaScript for the collapsible sections
104
- document.addEventListener('DOMContentLoaded', function () {
105
- const allToggleButtons = document.querySelectorAll('.toggle-button');
106
-
107
- allToggleButtons.forEach(button => {
108
- button.addEventListener('click', function () {
109
- const content = this.nextElementSibling;
110
- const plusIcon = this.querySelector('.plus-icon');
111
- const minusIcon = this.querySelector('.minus-icon');
112
-
113
- content.classList.toggle('expanded');
114
-
115
- plusIcon.classList.toggle('hidden');
116
- minusIcon.classList.toggle('hidden');
117
- });
118
- });
119
- });
120
- </script>
121
-
122
- </body>
123
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Focal</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <link rel="preconnect" href="https://fonts.googleapis.com">
9
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
+ <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@600&family=Inter:wght@400;500&display=swap" rel="stylesheet">
11
+ <script>
12
+ tailwind.config = {
13
+ theme: {
14
+ extend: {
15
+ fontFamily: {
16
+ sans: ['Inter', 'sans-serif'],
17
+ display: ['Poppins', 'sans-serif'],
18
+ },
19
+ colors: {
20
+ 'background': '#FFFFFF',
21
+ 'background-light': '#F9F9F9',
22
+ 'text-primary': '#121212',
23
+ 'primary': '#2563EB',
24
+ 'primary-dark': '#1E40AF',
25
+ }
26
+ }
27
+ }
28
+ }
29
+ </script>
30
+ <style>
31
+ .collapsible-content {
32
+ max-height: 0;
33
+ overflow: hidden;
34
+ transition: max-height 0.5s cubic-bezier(0.4, 0, 0.2, 1);
35
+ }
36
+ .collapsible-content.expanded {
37
+ max-height: 500px;
38
+ }
39
+ .collapsible-content .border {
40
+ border-color: #E5E7EB;
41
+ }
42
+ </style>
43
+ </head>
44
+ <body class="bg-background text-text-primary font-sans antialiased">
45
+
46
+ <header class="container mx-auto px-6 py-5">
47
+ <div class="flex items-center border-b border-primary/20 pb-3">
48
+ <svg class="h-8 w-8 mr-2" viewBox="0 0 32 32" fill="black" xmlns="http://www.w3.org/2000/svg">
49
+ <circle cx="16" cy="16" r="10" />
50
+ </svg>
51
+ <a href="#" class="font-display text-2xl tracking-widest uppercase">Focal</a>
52
+ </div>
53
+ </header>
54
+
55
+ <main class="container mx-auto px-6 py-12 max-w-4xl">
56
+
57
+ {% if message %}
58
+ <div class="text-center text-lg text-text-primary">
59
+ {{ message }}
60
+ </div>
61
+ {% endif %}
62
+
63
+ {% for article in articles %}
64
+ <article class="mb-20 md:mb-24">
65
+ <p class="text-sm font-medium text-primary uppercase tracking-wider mb-3">{{ article.date }}</p>
66
+ <h1 class="font-display font-semibold text-2xl md:text-3xl text-text-primary mb-6 leading-tight">
67
+ {{ article.title }}
68
+ </h1>
69
+ <div class="text-lg text-text-primary/80 leading-relaxed space-y-5">
70
+ <p>
71
+ {{ article.content }}
72
+ </p>
73
+ </div>
74
+ {% if article.sources %}
75
+ <div class="mt-8">
76
+ <button class="toggle-button flex items-center text-primary font-medium text-sm transition-colors hover:text-text-primary">
77
+ <span class="mr-2">Further Reading</span>
78
+ <svg class="plus-icon h-4 w-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M12 4.5v15m7.5-7.5h-15" /></svg>
79
+ <svg class="minus-icon h-4 w-4 hidden" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M19.5 12h-15" /></svg>
80
+ </button>
81
+
82
+ <div class="collapsible-content mt-4">
83
+ <div class="bg-background-light p-5 rounded-md border border-gray-700/50">
84
+ <ul class="space-y-3 list-disc list-inside text-text-primary/70">
85
+ {% for source in article.sources %}
86
+ <li><a href="{{ source.url }}" class="hover:text-primary transition-colors">{{ source.url }}</a></li>
87
+ {% endfor %}
88
+ </ul>
89
+ </div>
90
+ </div>
91
+ </div>
92
+ {% endif %}
93
+ </article>
94
+ {% endfor %}
95
+
96
+ </main>
97
+
98
+ <footer class="container mx-auto px-6 py-4 text-center text-sm text-text-primary/60">
99
+ <p>All content is AI-generated, and may contain inaccuracies.</p>
100
+ </footer>
101
+
102
+ <script>
103
+ // JavaScript for the collapsible sections
104
+ document.addEventListener('DOMContentLoaded', function () {
105
+ const allToggleButtons = document.querySelectorAll('.toggle-button');
106
+
107
+ allToggleButtons.forEach(button => {
108
+ button.addEventListener('click', function () {
109
+ const content = this.nextElementSibling;
110
+ const plusIcon = this.querySelector('.plus-icon');
111
+ const minusIcon = this.querySelector('.minus-icon');
112
+
113
+ content.classList.toggle('expanded');
114
+
115
+ plusIcon.classList.toggle('hidden');
116
+ minusIcon.classList.toggle('hidden');
117
+ });
118
+ });
119
+ });
120
+ </script>
121
+
122
+ </body>
123
  </html>
app/update_news.py CHANGED
@@ -1,50 +1,50 @@
1
- from sentence_transformers import SentenceTransformer
2
- from transformers import pipeline
3
- from .summarizer import Summarizer
4
- from .news_fetcher import news_summary
5
- from .database import get_session, clear_articles, add_article, add_sources
6
- import datetime
7
-
8
-
9
- def read_rss_feed_urls(filename : str ='rss_feeds.txt') -> list[str]:
10
- try:
11
- with open(filename, 'r') as f:
12
- urls = [url.strip() for url in f.readlines()]
13
- return urls
14
- except:
15
- return None
16
-
17
-
18
-
19
- def update_news():
20
- print(f'Initiating news update: {datetime.datetime.now()}')
21
-
22
- # model to create embeddings from sentences
23
- embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
24
-
25
- # model to summarize text
26
- summarizing_model = pipeline("summarization", model="facebook/bart-large-cnn")
27
-
28
- # wraps models
29
- summarizer = Summarizer(embedding_model=embedding_model, summarization_model=summarizing_model)
30
-
31
- # rss feeds for current headlines
32
- rss_feed_urls = read_rss_feed_urls()
33
-
34
- # clear old articles
35
- with get_session() as session:
36
- clear_articles(session)
37
-
38
- # summarize articles and add to database
39
- for article, sources in news_summary(summarizer, rss_feed_urls):
40
- with get_session() as session:
41
- add_article(session, article)
42
-
43
- for source in sources:
44
- source.article_id = article.id
45
-
46
- add_sources(session, sources)
47
-
48
-
49
- if __name__ == '__main__':
50
- update_news()
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from transformers import pipeline
3
+ from .summarizer import Summarizer
4
+ from .news_fetcher import news_summary
5
+ from .database import get_session, clear_articles, add_article, add_sources
6
+ import datetime
7
+
8
+
9
def read_rss_feed_urls(filename: str = 'rss_feeds.txt') -> 'list[str] | None':
    """Read RSS feed URLs from *filename*, one URL per line.

    Leading/trailing whitespace is stripped and blank lines are ignored,
    so a trailing newline in the file does not produce an empty URL.

    Returns the list of URLs, or ``None`` if the file cannot be read
    (missing, unreadable, ...), preserving the original error contract.
    """
    try:
        with open(filename, 'r') as f:
            return [url.strip() for url in f if url.strip()]
    except OSError:
        # Narrowed from a bare ``except`` so that unrelated failures
        # (KeyboardInterrupt, SystemExit, programming errors) propagate
        # instead of being silently swallowed.
        return None
16
+
17
+
18
+
19
def update_news():
    """Rebuild the article database with fresh summaries from the RSS feeds.

    Loads the embedding and summarization models, clears the previously
    stored articles, then summarizes the current headlines and persists
    each article together with its source links.
    """
    print(f'Initiating news update: {datetime.datetime.now()}')

    # Sentence-embedding model (turns sentences into vectors).
    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Abstractive text-summarization model.
    bart = pipeline("summarization", model="facebook/bart-large-cnn")

    # Facade that combines both models for the news pipeline.
    summarizer = Summarizer(embedding_model=embedder, summarization_model=bart)

    # Feed URLs for the current headlines.
    feed_urls = read_rss_feed_urls()

    # Drop stale articles before inserting the new batch.
    with get_session() as session:
        clear_articles(session)

    # Persist each summarized article and link its sources to it.
    for article, article_sources in news_summary(summarizer, feed_urls):
        with get_session() as session:
            add_article(session, article)
            for src in article_sources:
                src.article_id = article.id
            add_sources(session, article_sources)


if __name__ == '__main__':
    update_news()
compose.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Comments are provided throughout this file to help you get started.
2
+ # If you need more help, visit the Docker Compose reference guide at
3
+ # https://docs.docker.com/go/compose-spec-reference/
4
+
5
+ # Here the instructions define your application as a service called "server".
6
+ # This service is built from the Dockerfile in the current directory.
7
+ # You can add other services your application may depend on here, such as a
8
+ # database or a cache. For examples, see the Awesome Compose repository:
9
+ # https://github.com/docker/awesome-compose
10
+ services:
11
+ server:
12
+ build:
13
+ context: .
14
+ ports:
15
+ - 7860:7860
16
+
17
+ # The commented out section below is an example of how to define a PostgreSQL
18
+ # database that your application can use. `depends_on` tells Docker Compose to
19
+ # start the database before your application. The `db-data` volume persists the
20
+ # database data between container restarts. The `db-password` secret is used
21
+ # to set the database password. You must create `db/password.txt` and add
22
+ # a password of your choosing to it before running `docker compose up`.
23
+ # depends_on:
24
+ # db:
25
+ # condition: service_healthy
26
+ # db:
27
+ # image: postgres
28
+ # restart: always
29
+ # user: postgres
30
+ # secrets:
31
+ # - db-password
32
+ # volumes:
33
+ # - db-data:/var/lib/postgresql/data
34
+ # environment:
35
+ # - POSTGRES_DB=example
36
+ # - POSTGRES_PASSWORD_FILE=/run/secrets/db-password
37
+ # expose:
38
+ # - 5432
39
+ # healthcheck:
40
+ # test: [ "CMD", "pg_isready" ]
41
+ # interval: 10s
42
+ # timeout: 5s
43
+ # retries: 5
44
+ # volumes:
45
+ # db-data:
46
+ # secrets:
47
+ # db-password:
48
+ # file: db/password.txt
49
+
rss_feeds.txt CHANGED
@@ -1,4 +1,4 @@
1
- http://feeds.bbci.co.uk/news/world/rss.xml
2
- http://rss.cnn.com/rss/cnn_topstories.rss
3
- http://feeds.nytimes.com/nyt/rss/HomePage
4
  https://www.theguardian.com/world/rss
 
1
+ http://feeds.bbci.co.uk/news/world/rss.xml
2
+ http://rss.cnn.com/rss/cnn_topstories.rss
3
+ http://feeds.nytimes.com/nyt/rss/HomePage
4
  https://www.theguardian.com/world/rss
test.py CHANGED
@@ -1,4 +1,4 @@
1
- from database import DatabaseConnection
2
- db = DatabaseConnection()
3
- print(db.retrieve_articles())
4
  db.close()
 
1
from database import DatabaseConnection

# Smoke test: fetch and display all stored articles.  The connection is
# closed in a ``finally`` block so it is released even if retrieval fails
# (the original leaked the connection on error).
db = DatabaseConnection()
try:
    print(db.retrieve_articles())
finally:
    db.close()