michaelkri commited on
Commit
5d924ac
·
1 Parent(s): 469e535

Added Docker

Browse files
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // The name of the dev container
3
+ "name": "Focal",
4
+
5
+ // The Dockerfile to use for building the dev container
6
+ "dockerFile": "../Dockerfile",
7
+
8
+ // This is the key setting to prevent VS Code from overriding your CMD.
9
+ // Setting it to 'false' ensures your Dockerfile's CMD command is respected.
10
+ "overrideCommand": false,
11
+
12
+ // Any ports to forward from the container to your local machine
13
+ "forwardPorts": [7860],
14
+
15
+ // Extensions to install in the container
16
+ "customizations": {
17
+ "vscode": {
18
+ "extensions": [
19
+ "ms-python.python",
20
+ "ms-python.vscode-pylance"
21
+ ]
22
+ }
23
+ }
24
+ }
.dockerignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Include any files or directories that you don't want to be copied to your
2
+ # container here (e.g., local build artifacts, temporary files, etc.).
3
+ #
4
+ # For more help, visit the .dockerignore file reference guide at
5
+ # https://docs.docker.com/go/build-context-dockerignore/
6
+
7
+ **/.DS_Store
8
+ **/__pycache__
9
+ **/.venv
10
+ **/.classpath
11
+ **/.dockerignore
12
+ **/.env
13
+ **/.git
14
+ **/.gitignore
15
+ **/.project
16
+ **/.settings
17
+ **/.toolstarget
18
+ **/.vs
19
+ **/.vscode
20
+ **/*.*proj.user
21
+ **/*.dbmdl
22
+ **/*.jfm
23
+ **/bin
24
+ **/charts
25
+ **/docker-compose*
26
+ **/compose.y*ml
27
+ **/Dockerfile*
28
+ **/node_modules
29
+ **/npm-debug.log
30
+ **/obj
31
+ **/secrets.dev.yaml
32
+ **/values.dev.yaml
33
+ LICENSE
34
+ README.md
.github/dependabot.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for more information:
4
+ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5
+ # https://containers.dev/guide/dependabot
6
+
7
+ version: 2
8
+ updates:
9
+ - package-ecosystem: "devcontainers"
10
+ directory: "/"
11
+ schedule:
12
+ interval: weekly
.gitignore CHANGED
@@ -1,4 +1,4 @@
1
- __pycache__
2
- venv
3
- .venv
4
  *.db
 
1
+ __pycache__
2
+ venv
3
+ .venv
4
  *.db
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.Docker.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Building and running your application
2
+
3
+ When you're ready, start your application by running:
4
+ `docker compose up --build`.
5
+
6
+ Your application will be available at http://localhost:7860.
7
+
8
+ ### Deploying your application to the cloud
9
+
10
+ First, build your image, e.g.: `docker build -t myapp .`.
11
+ If your cloud uses a different CPU architecture than your development
12
+ machine (e.g., you are on a Mac M1 and your cloud provider is amd64),
13
+ you'll want to build the image for that platform, e.g.:
14
+ `docker build --platform=linux/amd64 -t myapp .`.
15
+
16
+ Then, push it to your registry, e.g. `docker push myregistry.com/myapp`.
17
+
18
+ Consult Docker's [getting started](https://docs.docker.com/go/get-started-sharing/)
19
+ docs for more detail on building and pushing.
20
+
21
+ ### References
22
+ * [Docker's Python guide](https://docs.docker.com/language/python/)
app/database.py CHANGED
@@ -1,86 +1,86 @@
1
- from sqlalchemy import create_engine, UnicodeText, DateTime, ForeignKey
2
- from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session, joinedload, relationship
3
- from sqlalchemy.sql import func
4
- from contextlib import contextmanager
5
- import datetime
6
-
7
-
8
- class Base(DeclarativeBase):
9
- pass
10
-
11
-
12
- class Article(Base):
13
- __tablename__ = 'article'
14
-
15
- id: Mapped[int] = mapped_column(primary_key=True)
16
- title: Mapped[str] = mapped_column(UnicodeText)
17
- content: Mapped[str] = mapped_column(UnicodeText)
18
- date: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
19
-
20
- sources = relationship('Source', backref='article')
21
-
22
- def __repr__(self) -> str:
23
- return f'Article(id={self.id}, title={self.title}, content={self.content})'
24
-
25
-
26
- class Source(Base):
27
- '''
28
- Represents a source URL for an article.
29
- '''
30
- __tablename__ = 'source'
31
-
32
- id: Mapped[int] = mapped_column(primary_key=True)
33
- url: Mapped[str] = mapped_column(UnicodeText)
34
- article_id: Mapped[int] = mapped_column(ForeignKey('article.id'))
35
-
36
-
37
- # create an engine
38
- engine = create_engine('sqlite:///news.db', echo=True)
39
-
40
- # create tables if needed
41
- Base.metadata.create_all(engine)
42
-
43
-
44
- @contextmanager
45
- def get_session():
46
- '''
47
- Context manager for creating and closing a database session
48
- '''
49
- # create a session
50
- session = Session(engine)
51
- try:
52
- yield session
53
- finally:
54
- session.close()
55
-
56
-
57
- def add_article(session: Session, article: Article):
58
- '''
59
- Adds a new article to the database
60
- '''
61
- session.add(article)
62
- session.commit()
63
-
64
-
65
- def retrieve_articles(session: Session):
66
- '''
67
- Returns a list containing all articles
68
- '''
69
- return session.query(Article).options(joinedload(Article.sources)).all()
70
-
71
-
72
- def clear_articles(session: Session):
73
- '''
74
- Deletes all articles in the database
75
- '''
76
- session.query(Article).delete()
77
- session.commit()
78
-
79
-
80
- def add_sources(session: Session, sources: list[Source]):
81
- '''
82
- Adds the given sources to the database
83
- '''
84
- for source in sources:
85
- session.add(source)
86
  session.commit()
 
1
+ from sqlalchemy import create_engine, UnicodeText, DateTime, ForeignKey
2
+ from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session, joinedload, relationship
3
+ from sqlalchemy.sql import func
4
+ from contextlib import contextmanager
5
+ import datetime
6
+
7
+
8
class Base(DeclarativeBase):
    """Shared declarative base for all ORM models in this module."""
10
+
11
+
12
class Article(Base):
    '''
    A summarized news article stored in the database.
    '''
    __tablename__ = 'article'

    # surrogate primary key
    id: Mapped[int] = mapped_column(primary_key=True)
    # headline used as the article title
    title: Mapped[str] = mapped_column(UnicodeText)
    # generated summary text
    content: Mapped[str] = mapped_column(UnicodeText)
    # creation timestamp assigned by the database server
    date: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())

    # source rows backing this article; the backref also gives Source an
    # 'article' attribute
    sources = relationship('Source', backref='article')

    def __repr__(self) -> str:
        return f'Article(id={self.id}, title={self.title}, content={self.content})'
24
+
25
+
26
class Source(Base):
    '''
    Represents a source URL for an article.
    '''
    __tablename__ = 'source'

    # surrogate primary key
    id: Mapped[int] = mapped_column(primary_key=True)
    # URL of the original article this source points at
    url: Mapped[str] = mapped_column(UnicodeText)
    # owning article (see Article.sources relationship)
    article_id: Mapped[int] = mapped_column(ForeignKey('article.id'))
35
+
36
+
37
# module-level engine backed by the SQLite file news.db
# NOTE(review): echo=True logs every SQL statement — presumably intentional
# for debugging; confirm before relying on this in production.
engine = create_engine('sqlite:///news.db', echo=True)

# create any missing tables at import time
Base.metadata.create_all(engine)
42
+
43
+
44
@contextmanager
def get_session():
    '''
    Context manager yielding a database session that is always closed.
    '''
    # Session implements the context-manager protocol and closes itself
    # on exit, which replaces an explicit try/finally around close().
    with Session(engine) as session:
        yield session
55
+
56
+
57
def add_article(session: Session, article: Article):
    '''
    Persist a single new article and commit the transaction immediately.
    '''
    session.add(article)
    session.commit()
63
+
64
+
65
def retrieve_articles(session: Session):
    '''
    Return a list of all articles, with their sources eagerly loaded.
    '''
    # joinedload avoids one extra query per article when templates read
    # article.sources
    query = session.query(Article).options(joinedload(Article.sources))
    return query.all()
70
+
71
+
72
def clear_articles(session: Session):
    '''
    Delete every stored article and commit the deletion.
    '''
    session.query(Article).delete()
    session.commit()
78
+
79
+
80
def add_sources(session: Session, sources: list[Source]):
    '''
    Persist the given sources and commit once at the end.
    '''
    # add_all replaces the previous manual add() loop — one call, same
    # single commit afterwards
    session.add_all(sources)
    session.commit()
app/main.py CHANGED
@@ -1,71 +1,71 @@
1
- from fastapi import FastAPI, Request, BackgroundTasks
2
- from fastapi.templating import Jinja2Templates
3
- from contextlib import asynccontextmanager
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from .database import get_session, retrieve_articles
6
- from .update_news import update_news
7
- import threading
8
-
9
-
10
- # for updating the news feed periodically
11
- scheduler = BackgroundScheduler()
12
-
13
- # prevent initiating multiple concurrent updates
14
- is_updating = threading.Lock()
15
-
16
-
17
- @asynccontextmanager
18
- async def lifespan(app: FastAPI):
19
- # update news periodically
20
- scheduler.add_job(update_news, 'interval', minutes=30)
21
- scheduler.start()
22
- yield
23
- # stop the scheduler when closing the server
24
- scheduler.shutdown()
25
-
26
-
27
- app = FastAPI(lifespan=lifespan)
28
-
29
- templates = Jinja2Templates(directory='app/templates')
30
-
31
-
32
- def safe_update_news():
33
- '''
34
- Wrapper for update_news to ensure only one instance runs at a time.
35
- '''
36
- # check whether there is no update currently in progress
37
- if not is_updating.locked():
38
- # update news in background
39
- with is_updating:
40
- update_news()
41
-
42
-
43
- @app.get('/')
44
- async def read_root(request: Request, background_tasks: BackgroundTasks):
45
- # retrieve articles from database
46
- with get_session() as session:
47
- articles = retrieve_articles(session=session)
48
-
49
- # no articles yet
50
- if not articles:
51
- # update news in background
52
- background_tasks.add_task(safe_update_news)
53
-
54
- return templates.TemplateResponse(
55
- 'index.html',
56
- {
57
- 'request': request,
58
- 'articles': articles,
59
- 'message': 'Summarizing news articles now.'
60
- ' Please check back in a few minutes.'
61
- }
62
- )
63
-
64
- return templates.TemplateResponse(
65
- 'index.html',
66
- {
67
- 'request': request,
68
- 'articles': articles,
69
- 'message': 'Additional articles are being summarized right now.' if is_updating.locked() else None
70
- }
71
- )
 
1
+ from fastapi import FastAPI, Request, BackgroundTasks
2
+ from fastapi.templating import Jinja2Templates
3
+ from contextlib import asynccontextmanager
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from .database import get_session, retrieve_articles
6
+ from .update_news import update_news
7
+ import threading
8
+
9
+
10
# background scheduler that refreshes the news feed on a fixed interval
scheduler = BackgroundScheduler()

# lock guarding against two news updates running concurrently
is_updating = threading.Lock()
15
+
16
+
17
@asynccontextmanager
async def lifespan(app: FastAPI):
    '''
    FastAPI lifespan hook: keep the refresh scheduler running while the
    application is up.
    '''
    # refresh the news every 30 minutes
    scheduler.add_job(update_news, 'interval', minutes=30)
    scheduler.start()
    yield
    # stop the scheduler when the server shuts down
    scheduler.shutdown()
25
+
26
+
27
# application instance wired to the lifespan hook above
app = FastAPI(lifespan=lifespan)

# Jinja2 templates rendered from app/templates
templates = Jinja2Templates(directory='app/templates')
30
+
31
+
32
def safe_update_news():
    '''
    Wrapper for update_news to ensure only one instance runs at a time.

    Uses a non-blocking acquire so the "is an update running?" check and
    the lock acquisition are one atomic step. The previous
    locked()-then-`with` pattern had a race window: two tasks could both
    observe the lock as free, and the second would then block and run a
    redundant update as soon as the first finished.
    '''
    if is_updating.acquire(blocking=False):
        try:
            update_news()
        finally:
            is_updating.release()
41
+
42
+
43
@app.get('/')
async def read_root(request: Request, background_tasks: BackgroundTasks):
    '''
    Render the front page with all summarized articles.

    When the database is empty, a background refresh is scheduled and the
    user is told to check back later; when a refresh is already running,
    a progress message is shown instead.
    '''
    # retrieve articles from database
    with get_session() as session:
        articles = retrieve_articles(session=session)

    # pick the status message, then build one response (the original
    # duplicated the TemplateResponse construction in both branches)
    if not articles:
        # no articles yet: update news in background
        background_tasks.add_task(safe_update_news)
        message = ('Summarizing news articles now.'
                   ' Please check back in a few minutes.')
    elif is_updating.locked():
        message = 'Additional articles are being summarized right now.'
    else:
        message = None

    return templates.TemplateResponse(
        'index.html',
        {
            'request': request,
            'articles': articles,
            'message': message
        }
    )
app/news_fetcher.py CHANGED
@@ -1,56 +1,56 @@
1
- import feedparser
2
- from .summarizer import Summarizer
3
- from .scraper import get_articles
4
- from .database import Article, Source
5
-
6
-
7
- def topic_summary(summarizer: Summarizer, query: str, max_results: int = 5, min_cluster_size: int = 2) -> str:
8
- print(f'Beginning search for \'{query}\'...')
9
- articles_and_urls = get_articles(query, max_results=max_results)
10
- print(f'Retrieved {len(articles_and_urls)} articles')
11
- articles = [item[0] for item in articles_and_urls]
12
- urls = [item[1] for item in articles_and_urls]
13
- summary = summarizer.combined_summary(articles, min_cluster_size=min_cluster_size)
14
- return summary, urls
15
-
16
-
17
- def fetch_headlines(summarizer: Summarizer, rss_feed_urls: list[str], max_articles=30) -> list[str]:
18
- # parse the RSS feeds
19
- feeds = [feedparser.parse(url) for url in rss_feed_urls]
20
-
21
- # extract max_article headlines from each feed
22
- headlines = [entry.title for feed in feeds for entry in feed.entries[:max_articles]]
23
-
24
- # create embeddings for each headline
25
- embeddings = summarizer.create_embeddings(headlines)
26
-
27
- # cluster headlines based on embeddings
28
- clusters = summarizer.cluster_sentences(headlines, embeddings)
29
-
30
- clustered_headlines = []
31
- for cluster in clusters:
32
- ranked_headlines = summarizer.rank_cluster_sentences(cluster)
33
-
34
- generic_headline = ranked_headlines[0]
35
-
36
- clustered_headlines.append(generic_headline)
37
-
38
- return clustered_headlines
39
-
40
-
41
- def news_summary(summarizer: Summarizer, rss_feed_urls: list[str]):
42
- # fetch current headlines
43
- headlines = fetch_headlines(summarizer, rss_feed_urls)
44
-
45
- # summarize each topic and yield articles
46
- for headline in headlines:
47
- summary, source_urls = topic_summary(summarizer, headline)
48
-
49
- article = Article(
50
- title=headline,
51
- content=summary
52
- )
53
-
54
- sources = [Source(url=s_url, article_id=article.id) for s_url in source_urls]
55
-
56
  yield article, sources
 
1
+ import feedparser
2
+ from .summarizer import Summarizer
3
+ from .scraper import get_articles
4
+ from .database import Article, Source
5
+
6
+
7
def topic_summary(summarizer: Summarizer, query: str, max_results: int = 5, min_cluster_size: int = 2) -> tuple[str, list[str]]:
    '''
    Search for news on `query` and summarize the retrieved articles.

    Returns a (summary, urls) pair — the combined summary text and the
    list of source URLs. (The previous annotation said `-> str`, but the
    function has always returned a tuple.)
    '''
    print(f'Beginning search for \'{query}\'...')
    articles_and_urls = get_articles(query, max_results=max_results)
    print(f'Retrieved {len(articles_and_urls)} articles')
    # split the (article_text, url) pairs
    articles = [article for article, _ in articles_and_urls]
    urls = [url for _, url in articles_and_urls]
    summary = summarizer.combined_summary(articles, min_cluster_size=min_cluster_size)
    return summary, urls
15
+
16
+
17
def fetch_headlines(summarizer: Summarizer, rss_feed_urls: list[str], max_articles=30) -> list[str]:
    '''
    Pull headlines from the given RSS feeds and collapse near-duplicates.

    Headlines are embedded, clustered by semantic similarity, and each
    cluster is represented by its most central headline.
    '''
    parsed_feeds = [feedparser.parse(feed_url) for feed_url in rss_feed_urls]

    # take at most max_articles entries from each feed
    headlines = [
        entry.title
        for feed in parsed_feeds
        for entry in feed.entries[:max_articles]
    ]

    # embed and cluster the headlines
    embeddings = summarizer.create_embeddings(headlines)
    clusters = summarizer.cluster_sentences(headlines, embeddings)

    # represent each cluster by its top-ranked (most central) headline
    return [summarizer.rank_cluster_sentences(cluster)[0] for cluster in clusters]
39
+
40
+
41
def news_summary(summarizer: Summarizer, rss_feed_urls: list[str]):
    '''
    Generator yielding one (Article, [Source, ...]) pair per current
    headline topic.
    '''
    for headline in fetch_headlines(summarizer, rss_feed_urls):
        summary, source_urls = topic_summary(summarizer, headline)

        article = Article(
            title=headline,
            content=summary
        )

        # NOTE(review): article.id is presumably None here since the
        # article has not been persisted yet; the caller appears to fix up
        # Source.article_id after inserting the article — confirm.
        sources = [Source(url=s_url, article_id=article.id) for s_url in source_urls]

        yield article, sources
app/scraper.py CHANGED
@@ -1,28 +1,28 @@
1
- from ddgs import DDGS
2
- import trafilatura
3
-
4
-
5
- def retrieve_article(url: str) -> str:
6
- try:
7
- page = trafilatura.fetch_url(url)
8
- return trafilatura.extract(page)
9
- except Exception as e:
10
- print(e.args)
11
- return None
12
-
13
-
14
- def get_articles(query, max_results=5, min_article_length=100) -> list[tuple[str, str]]:
15
- # search DuckDuckGo for news on query
16
- search_results = []
17
- try:
18
- search_results = DDGS().news(query, timelimit='d', max_results=max_results)
19
- except Exception as e:
20
- print(e.args)
21
-
22
- # get article urls
23
- urls = set([r['url'] for r in search_results])
24
-
25
- # try to retrieve articles (filter inaccessible and short articles)
26
- articles = [(article, url) for url in urls if (article := retrieve_article(url)) is not None and len(article) > min_article_length]
27
-
28
  return articles
 
1
+ from ddgs import DDGS
2
+ import trafilatura
3
+
4
+
5
def retrieve_article(url: str) -> "str | None":
    '''
    Download and extract the main text of the article at `url`.

    Returns the extracted text, or None when the fetch raises or no text
    could be extracted (trafilatura.extract can itself return None).
    The annotation is quoted so it stays valid on Python 3.9 (the
    Dockerfile's base image), where `str | None` cannot be evaluated.
    '''
    try:
        page = trafilatura.fetch_url(url)
        return trafilatura.extract(page)
    except Exception as e:
        # best-effort scraping: log and skip articles that fail
        print(e.args)
        return None
12
+
13
+
14
def get_articles(query, max_results=5, min_article_length=100) -> list[tuple[str, str]]:
    '''
    Search DuckDuckGo News for `query` and scrape the resulting articles.

    Returns (article_text, url) pairs, skipping pages that could not be
    retrieved or whose extracted text is at most min_article_length chars.
    '''
    search_results = []
    try:
        search_results = DDGS().news(query, timelimit='d', max_results=max_results)
    except Exception as e:
        # best-effort: a failed search simply yields no articles
        print(e.args)

    # deduplicate article urls (set comprehension instead of set([...]))
    urls = {r['url'] for r in search_results}

    # try to retrieve articles (filter inaccessible and short articles)
    articles = [
        (article, url)
        for url in urls
        if (article := retrieve_article(url)) is not None
        and len(article) > min_article_length
    ]

    return articles
app/summarizer.py CHANGED
@@ -1,120 +1,120 @@
1
- import nltk
2
- from nltk import tokenize
3
- from sklearn.cluster import HDBSCAN
4
- from sklearn.metrics.pairwise import cosine_similarity
5
- import numpy as np
6
-
7
-
8
- class Summarizer:
9
- def __init__(self, embedding_model, summarization_model):
10
- nltk.download('punkt_tab') # for tokenizing into sentences
11
-
12
- self.embedding_model = embedding_model
13
- self.summarization_model = summarization_model
14
-
15
-
16
- def cluster_sentences(self, sentences, embeddings, min_cluster_size=2):
17
- hdb = HDBSCAN(min_cluster_size=min_cluster_size).fit(embeddings)
18
-
19
- clusters = {}
20
- for i, sentence in enumerate(sentences):
21
- cluster_id = hdb.labels_[i]
22
- if cluster_id == -1: # discard "noise"
23
- continue
24
- if cluster_id not in clusters:
25
- clusters[cluster_id] = []
26
- clusters[cluster_id].append((sentence, embeddings[i]))
27
-
28
- return list(clusters.values())
29
-
30
-
31
- def create_embeddings(self, sentences):
32
- return self.embedding_model.encode(sentences)
33
-
34
-
35
- def summarize(self, content, min_length=30, max_length=130):
36
- # truncate content if exceeds model capabilities
37
- max_model_length = self.summarization_model.model.config.max_position_embeddings
38
- if len(content) > max_model_length:
39
- content = content[:max_model_length]
40
-
41
- max_length = max(min_length, min(len(content), max_length))
42
-
43
- return self.summarization_model(
44
- content,
45
- min_length=min_length,
46
- max_length=max_length,
47
- do_sample=False)[0]['summary_text']
48
-
49
-
50
- @staticmethod
51
- def rank_cluster_sentences(cluster):
52
- # separate sentences and embeddings
53
- sentences = [entry[0] for entry in cluster]
54
- embeddings = [entry[1] for entry in cluster]
55
- # find center of cluster
56
- center = np.mean(embeddings, axis=0)
57
- # score sentences by similarity to center
58
- scores = cosine_similarity([center], embeddings)[0]
59
- # rank sentences by score
60
- ranked_sentences = [sentence for _, sentence in sorted(zip(scores, sentences), reverse=True)]
61
- return ranked_sentences
62
-
63
-
64
- def summarize_clusters(self, clusters, top_cluster_count=5):
65
- # sort clusters by their length (descending) to find the most important topics
66
- clusters = sorted(clusters, key=len, reverse=True)
67
-
68
- # take only the top_cluster_count clusters
69
- clusters = clusters[:top_cluster_count]
70
-
71
- # combine key sentences from each cluster
72
- key_sentences = []
73
- for i, cluster in enumerate(clusters):
74
- print(f'Extracting from cluster {i + 1}...')
75
- top_sentences = Summarizer.rank_cluster_sentences(cluster)
76
- content = '\n'.join(top_sentences[:3])
77
- key_sentences.append(content)
78
-
79
- combined = ' '.join(key_sentences)
80
-
81
- # summarize all key sentences
82
- print('Creating response...')
83
- summary = self.summarize(
84
- combined,
85
- min_length=60,
86
- max_length=250
87
- )
88
- return summary
89
-
90
-
91
- def combined_summary(self, articles, min_cluster_size=3):
92
- if not articles:
93
- return 'No articles to summarize.'
94
-
95
- print('Tokenizing into sentences...')
96
- # create a list of all sentences from all articles
97
- sentences = []
98
- for article in articles:
99
- sentences.extend(tokenize.sent_tokenize(str.strip(article)))
100
-
101
- # remove duplicate sentences
102
- sentences = sorted(list(set(sentences)), key=sentences.index)
103
- print(f'Found {len(sentences)} unique sentences')
104
-
105
- if not sentences:
106
- return None
107
-
108
- print('Creating sentence embeddings...')
109
- # create embeddings
110
- embeddings = self.create_embeddings(sentences)
111
-
112
- print('Grouping sentences into clusters...')
113
- # group (embeddings of) sentences by similarity
114
- clusters = self.cluster_sentences(sentences, embeddings, min_cluster_size=min_cluster_size)
115
- print(f'Created {len(clusters)} clusters')
116
-
117
- # summarize all clusters into a single summary
118
- summary = self.summarize_clusters(clusters)
119
-
120
  return summary
 
1
+ import nltk
2
+ from nltk import tokenize
3
+ from sklearn.cluster import HDBSCAN
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import numpy as np
6
+
7
+
8
class Summarizer:
    '''
    Extractive-plus-abstractive news summarizer.

    Sentences are embedded, clustered by semantic similarity, and the
    most central sentences of the largest clusters are condensed by an
    abstractive summarization model.
    '''

    def __init__(self, embedding_model, summarization_model):
        # punkt_tab is needed by nltk's sentence tokenizer
        nltk.download('punkt_tab')

        self.embedding_model = embedding_model
        self.summarization_model = summarization_model


    def cluster_sentences(self, sentences, embeddings, min_cluster_size=2):
        '''
        Group sentences by embedding similarity using HDBSCAN.

        Returns a list of clusters, each a list of (sentence, embedding)
        pairs. Points HDBSCAN labels -1 ("noise") are discarded.
        '''
        hdb = HDBSCAN(min_cluster_size=min_cluster_size).fit(embeddings)

        clusters = {}
        for i, sentence in enumerate(sentences):
            cluster_id = hdb.labels_[i]
            if cluster_id == -1:  # discard "noise"
                continue
            # setdefault replaces the explicit key-existence check
            clusters.setdefault(cluster_id, []).append((sentence, embeddings[i]))

        return list(clusters.values())


    def create_embeddings(self, sentences):
        '''Return embeddings for the given sentences.'''
        return self.embedding_model.encode(sentences)


    def summarize(self, content, min_length=30, max_length=130):
        '''
        Abstractively summarize `content`.

        Input is truncated to the model's maximum position embeddings, and
        max_length is clamped so the requested summary is never longer
        than the input (but never below min_length).
        '''
        # truncate content if it exceeds model capabilities
        max_model_length = self.summarization_model.model.config.max_position_embeddings
        if len(content) > max_model_length:
            content = content[:max_model_length]

        max_length = max(min_length, min(len(content), max_length))

        return self.summarization_model(
            content,
            min_length=min_length,
            max_length=max_length,
            do_sample=False)[0]['summary_text']


    @staticmethod
    def rank_cluster_sentences(cluster):
        '''
        Rank a cluster's sentences by cosine similarity to the cluster
        centroid, most central first.
        '''
        # separate sentences and embeddings
        sentences = [entry[0] for entry in cluster]
        embeddings = [entry[1] for entry in cluster]
        # find center of cluster
        center = np.mean(embeddings, axis=0)
        # score sentences by similarity to center
        scores = cosine_similarity([center], embeddings)[0]
        # rank sentences by score, best first
        ranked_sentences = [sentence for _, sentence in sorted(zip(scores, sentences), reverse=True)]
        return ranked_sentences


    def summarize_clusters(self, clusters, top_cluster_count=5):
        '''
        Summarize the largest `top_cluster_count` clusters into one text.
        '''
        # cluster size is used as a proxy for topic importance
        clusters = sorted(clusters, key=len, reverse=True)[:top_cluster_count]

        # combine the top three key sentences from each cluster
        key_sentences = []
        for i, cluster in enumerate(clusters):
            print(f'Extracting from cluster {i + 1}...')
            top_sentences = Summarizer.rank_cluster_sentences(cluster)
            content = '\n'.join(top_sentences[:3])
            key_sentences.append(content)

        combined = ' '.join(key_sentences)

        # summarize all key sentences
        print('Creating response...')
        summary = self.summarize(
            combined,
            min_length=60,
            max_length=250
        )
        return summary


    def combined_summary(self, articles, min_cluster_size=3):
        '''
        Produce a single summary from a list of article texts.

        Returns the summary string, the literal 'No articles to summarize.'
        when `articles` is empty, or None when no sentences were found.
        '''
        if not articles:
            return 'No articles to summarize.'

        print('Tokenizing into sentences...')
        # create a list of all sentences from all articles
        sentences = []
        for article in articles:
            sentences.extend(tokenize.sent_tokenize(str.strip(article)))

        # de-duplicate while keeping first-occurrence order; dict.fromkeys
        # is O(n), unlike the previous sorted(..., key=sentences.index)
        # which called list.index per element (O(n^2))
        sentences = list(dict.fromkeys(sentences))
        print(f'Found {len(sentences)} unique sentences')

        if not sentences:
            return None

        print('Creating sentence embeddings...')
        embeddings = self.create_embeddings(sentences)

        print('Grouping sentences into clusters...')
        # group (embeddings of) sentences by similarity
        clusters = self.cluster_sentences(sentences, embeddings, min_cluster_size=min_cluster_size)
        print(f'Created {len(clusters)} clusters')

        # summarize all clusters into a single summary
        return self.summarize_clusters(clusters)
app/templates/index.html CHANGED
@@ -1,123 +1,123 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Focal</title>
7
- <script src="https://cdn.tailwindcss.com"></script>
8
- <link rel="preconnect" href="https://fonts.googleapis.com">
9
- <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
- <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@600&family=Inter:wght@400;500&display=swap" rel="stylesheet">
11
- <script>
12
- tailwind.config = {
13
- theme: {
14
- extend: {
15
- fontFamily: {
16
- sans: ['Inter', 'sans-serif'],
17
- display: ['Poppins', 'sans-serif'],
18
- },
19
- colors: {
20
- 'background': '#FFFFFF',
21
- 'background-light': '#F9F9F9',
22
- 'text-primary': '#121212',
23
- 'primary': '#2563EB',
24
- 'primary-dark': '#1E40AF',
25
- }
26
- }
27
- }
28
- }
29
- </script>
30
- <style>
31
- .collapsible-content {
32
- max-height: 0;
33
- overflow: hidden;
34
- transition: max-height 0.5s cubic-bezier(0.4, 0, 0.2, 1);
35
- }
36
- .collapsible-content.expanded {
37
- max-height: 500px;
38
- }
39
- .collapsible-content .border {
40
- border-color: #E5E7EB;
41
- }
42
- </style>
43
- </head>
44
- <body class="bg-background text-text-primary font-sans antialiased">
45
-
46
- <header class="container mx-auto px-6 py-5">
47
- <div class="flex items-center border-b border-primary/20 pb-3">
48
- <svg class="h-8 w-8 mr-2" viewBox="0 0 32 32" fill="black" xmlns="http://www.w3.org/2000/svg">
49
- <circle cx="16" cy="16" r="10" />
50
- </svg>
51
- <a href="#" class="font-display text-2xl tracking-widest uppercase">Focal</a>
52
- </div>
53
- </header>
54
-
55
- <main class="container mx-auto px-6 py-12 max-w-4xl">
56
-
57
- {% if message %}
58
- <div class="text-center text-lg text-text-primary">
59
- {{ message }}
60
- </div>
61
- {% endif %}
62
-
63
- {% for article in articles %}
64
- <article class="mb-20 md:mb-24">
65
- <p class="text-sm font-medium text-primary uppercase tracking-wider mb-3">{{ article.date }}</p>
66
- <h1 class="font-display font-semibold text-2xl md:text-3xl text-text-primary mb-6 leading-tight">
67
- {{ article.title }}
68
- </h1>
69
- <div class="text-lg text-text-primary/80 leading-relaxed space-y-5">
70
- <p>
71
- {{ article.content }}
72
- </p>
73
- </div>
74
- {% if article.sources %}
75
- <div class="mt-8">
76
- <button class="toggle-button flex items-center text-primary font-medium text-sm transition-colors hover:text-text-primary">
77
- <span class="mr-2">Further Reading</span>
78
- <svg class="plus-icon h-4 w-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M12 4.5v15m7.5-7.5h-15" /></svg>
79
- <svg class="minus-icon h-4 w-4 hidden" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M19.5 12h-15" /></svg>
80
- </button>
81
-
82
- <div class="collapsible-content mt-4">
83
- <div class="bg-background-light p-5 rounded-md border border-gray-700/50">
84
- <ul class="space-y-3 list-disc list-inside text-text-primary/70">
85
- {% for source in article.sources %}
86
- <li><a href="{{ source.url }}" class="hover:text-primary transition-colors">{{ source.url }}</a></li>
87
- {% endfor %}
88
- </ul>
89
- </div>
90
- </div>
91
- </div>
92
- {% endif %}
93
- </article>
94
- {% endfor %}
95
-
96
- </main>
97
-
98
- <footer class="container mx-auto px-6 py-4 text-center text-sm text-text-primary/60">
99
- <p>All content is AI-generated, and may contain inaccuracies.</p>
100
- </footer>
101
-
102
- <script>
103
- // JavaScript for the collapsible sections
104
- document.addEventListener('DOMContentLoaded', function () {
105
- const allToggleButtons = document.querySelectorAll('.toggle-button');
106
-
107
- allToggleButtons.forEach(button => {
108
- button.addEventListener('click', function () {
109
- const content = this.nextElementSibling;
110
- const plusIcon = this.querySelector('.plus-icon');
111
- const minusIcon = this.querySelector('.minus-icon');
112
-
113
- content.classList.toggle('expanded');
114
-
115
- plusIcon.classList.toggle('hidden');
116
- minusIcon.classList.toggle('hidden');
117
- });
118
- });
119
- });
120
- </script>
121
-
122
- </body>
123
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Focal</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <link rel="preconnect" href="https://fonts.googleapis.com">
9
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
+ <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@600&family=Inter:wght@400;500&display=swap" rel="stylesheet">
11
+ <script>
12
+ tailwind.config = {
13
+ theme: {
14
+ extend: {
15
+ fontFamily: {
16
+ sans: ['Inter', 'sans-serif'],
17
+ display: ['Poppins', 'sans-serif'],
18
+ },
19
+ colors: {
20
+ 'background': '#FFFFFF',
21
+ 'background-light': '#F9F9F9',
22
+ 'text-primary': '#121212',
23
+ 'primary': '#2563EB',
24
+ 'primary-dark': '#1E40AF',
25
+ }
26
+ }
27
+ }
28
+ }
29
+ </script>
30
+ <style>
31
+ .collapsible-content {
32
+ max-height: 0;
33
+ overflow: hidden;
34
+ transition: max-height 0.5s cubic-bezier(0.4, 0, 0.2, 1);
35
+ }
36
+ .collapsible-content.expanded {
37
+ max-height: 500px;
38
+ }
39
+ .collapsible-content .border {
40
+ border-color: #E5E7EB;
41
+ }
42
+ </style>
43
+ </head>
44
+ <body class="bg-background text-text-primary font-sans antialiased">
45
+
46
+ <header class="container mx-auto px-6 py-5">
47
+ <div class="flex items-center border-b border-primary/20 pb-3">
48
+ <svg class="h-8 w-8 mr-2" viewBox="0 0 32 32" fill="black" xmlns="http://www.w3.org/2000/svg">
49
+ <circle cx="16" cy="16" r="10" />
50
+ </svg>
51
+ <a href="#" class="font-display text-2xl tracking-widest uppercase">Focal</a>
52
+ </div>
53
+ </header>
54
+
55
+ <main class="container mx-auto px-6 py-12 max-w-4xl">
56
+
57
+ {% if message %}
58
+ <div class="text-center text-lg text-text-primary">
59
+ {{ message }}
60
+ </div>
61
+ {% endif %}
62
+
63
+ {% for article in articles %}
64
+ <article class="mb-20 md:mb-24">
65
+ <p class="text-sm font-medium text-primary uppercase tracking-wider mb-3">{{ article.date }}</p>
66
+ <h1 class="font-display font-semibold text-2xl md:text-3xl text-text-primary mb-6 leading-tight">
67
+ {{ article.title }}
68
+ </h1>
69
+ <div class="text-lg text-text-primary/80 leading-relaxed space-y-5">
70
+ <p>
71
+ {{ article.content }}
72
+ </p>
73
+ </div>
74
+ {% if article.sources %}
75
+ <div class="mt-8">
76
+ <button class="toggle-button flex items-center text-primary font-medium text-sm transition-colors hover:text-text-primary">
77
+ <span class="mr-2">Further Reading</span>
78
+ <svg class="plus-icon h-4 w-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M12 4.5v15m7.5-7.5h-15" /></svg>
79
+ <svg class="minus-icon h-4 w-4 hidden" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M19.5 12h-15" /></svg>
80
+ </button>
81
+
82
+ <div class="collapsible-content mt-4">
83
+ <div class="bg-background-light p-5 rounded-md border border-gray-700/50">
84
+ <ul class="space-y-3 list-disc list-inside text-text-primary/70">
85
+ {% for source in article.sources %}
86
+ <li><a href="{{ source.url }}" class="hover:text-primary transition-colors">{{ source.url }}</a></li>
87
+ {% endfor %}
88
+ </ul>
89
+ </div>
90
+ </div>
91
+ </div>
92
+ {% endif %}
93
+ </article>
94
+ {% endfor %}
95
+
96
+ </main>
97
+
98
+ <footer class="container mx-auto px-6 py-4 text-center text-sm text-text-primary/60">
99
+ <p>All content is AI-generated, and may contain inaccuracies.</p>
100
+ </footer>
101
+
102
+ <script>
103
+ // JavaScript for the collapsible sections
104
+ document.addEventListener('DOMContentLoaded', function () {
105
+ const allToggleButtons = document.querySelectorAll('.toggle-button');
106
+
107
+ allToggleButtons.forEach(button => {
108
+ button.addEventListener('click', function () {
109
+ const content = this.nextElementSibling;
110
+ const plusIcon = this.querySelector('.plus-icon');
111
+ const minusIcon = this.querySelector('.minus-icon');
112
+
113
+ content.classList.toggle('expanded');
114
+
115
+ plusIcon.classList.toggle('hidden');
116
+ minusIcon.classList.toggle('hidden');
117
+ });
118
+ });
119
+ });
120
+ </script>
121
+
122
+ </body>
123
  </html>
app/update_news.py CHANGED
@@ -1,50 +1,50 @@
1
- from sentence_transformers import SentenceTransformer
2
- from transformers import pipeline
3
- from .summarizer import Summarizer
4
- from .news_fetcher import news_summary
5
- from .database import get_session, clear_articles, add_article, add_sources
6
- import datetime
7
-
8
-
9
- def read_rss_feed_urls(filename : str ='rss_feeds.txt') -> list[str]:
10
- try:
11
- with open(filename, 'r') as f:
12
- urls = [url.strip() for url in f.readlines()]
13
- return urls
14
- except:
15
- return None
16
-
17
-
18
-
19
- def update_news():
20
- print(f'Initiating news update: {datetime.datetime.now()}')
21
-
22
- # model to create embeddings from sentences
23
- embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
24
-
25
- # model to summarize text
26
- summarizing_model = pipeline("summarization", model="facebook/bart-large-cnn")
27
-
28
- # wraps models
29
- summarizer = Summarizer(embedding_model=embedding_model, summarization_model=summarizing_model)
30
-
31
- # rss feeds for current headlines
32
- rss_feed_urls = read_rss_feed_urls()
33
-
34
- # clear old articles
35
- with get_session() as session:
36
- clear_articles(session)
37
-
38
- # summarize articles and add to database
39
- for article, sources in news_summary(summarizer, rss_feed_urls):
40
- with get_session() as session:
41
- add_article(session, article)
42
-
43
- for source in sources:
44
- source.article_id = article.id
45
-
46
- add_sources(session, sources)
47
-
48
-
49
- if __name__ == '__main__':
50
- update_news()
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from transformers import pipeline
3
+ from .summarizer import Summarizer
4
+ from .news_fetcher import news_summary
5
+ from .database import get_session, clear_articles, add_article, add_sources
6
+ import datetime
7
+
8
+
9
def read_rss_feed_urls(filename: str = 'rss_feeds.txt') -> 'list[str] | None':
    """Read RSS feed URLs from *filename*, one URL per line.

    Leading/trailing whitespace is stripped and blank lines are ignored,
    so a trailing newline in the file does not produce an empty URL.

    Returns the list of URLs, or ``None`` if the file cannot be read
    (missing, unreadable, ...), preserving the original error contract.
    """
    try:
        with open(filename, 'r') as f:
            return [url.strip() for url in f if url.strip()]
    except OSError:
        # Narrowed from a bare ``except`` so that unrelated failures
        # (KeyboardInterrupt, SystemExit, programming errors) propagate
        # instead of being silently swallowed.
        return None
16
+
17
+
18
+
19
def update_news():
    """Rebuild the article database with fresh summaries from the RSS feeds.

    Loads the embedding and summarization models, clears the previously
    stored articles, then summarizes the current headlines and persists
    each article together with its source links.
    """
    print(f'Initiating news update: {datetime.datetime.now()}')

    # Sentence-embedding model (turns sentences into vectors).
    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Abstractive text-summarization model.
    bart = pipeline("summarization", model="facebook/bart-large-cnn")

    # Facade that combines both models for the news pipeline.
    summarizer = Summarizer(embedding_model=embedder, summarization_model=bart)

    # Feed URLs for the current headlines.
    feed_urls = read_rss_feed_urls()

    # Drop stale articles before inserting the new batch.
    with get_session() as session:
        clear_articles(session)

    # Persist each summarized article and link its sources to it.
    for article, article_sources in news_summary(summarizer, feed_urls):
        with get_session() as session:
            add_article(session, article)
            for src in article_sources:
                src.article_id = article.id
            add_sources(session, article_sources)


if __name__ == '__main__':
    update_news()
compose.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Comments are provided throughout this file to help you get started.
2
+ # If you need more help, visit the Docker Compose reference guide at
3
+ # https://docs.docker.com/go/compose-spec-reference/
4
+
5
+ # Here the instructions define your application as a service called "server".
6
+ # This service is built from the Dockerfile in the current directory.
7
+ # You can add other services your application may depend on here, such as a
8
+ # database or a cache. For examples, see the Awesome Compose repository:
9
+ # https://github.com/docker/awesome-compose
10
+ services:
11
+ server:
12
+ build:
13
+ context: .
14
+ ports:
15
+ - 7860:7860
16
+
17
+ # The commented out section below is an example of how to define a PostgreSQL
18
+ # database that your application can use. `depends_on` tells Docker Compose to
19
+ # start the database before your application. The `db-data` volume persists the
20
+ # database data between container restarts. The `db-password` secret is used
21
+ # to set the database password. You must create `db/password.txt` and add
22
+ # a password of your choosing to it before running `docker compose up`.
23
+ # depends_on:
24
+ # db:
25
+ # condition: service_healthy
26
+ # db:
27
+ # image: postgres
28
+ # restart: always
29
+ # user: postgres
30
+ # secrets:
31
+ # - db-password
32
+ # volumes:
33
+ # - db-data:/var/lib/postgresql/data
34
+ # environment:
35
+ # - POSTGRES_DB=example
36
+ # - POSTGRES_PASSWORD_FILE=/run/secrets/db-password
37
+ # expose:
38
+ # - 5432
39
+ # healthcheck:
40
+ # test: [ "CMD", "pg_isready" ]
41
+ # interval: 10s
42
+ # timeout: 5s
43
+ # retries: 5
44
+ # volumes:
45
+ # db-data:
46
+ # secrets:
47
+ # db-password:
48
+ # file: db/password.txt
49
+
rss_feeds.txt CHANGED
@@ -1,4 +1,4 @@
1
- http://feeds.bbci.co.uk/news/world/rss.xml
2
- http://rss.cnn.com/rss/cnn_topstories.rss
3
- http://feeds.nytimes.com/nyt/rss/HomePage
4
  https://www.theguardian.com/world/rss
 
1
+ http://feeds.bbci.co.uk/news/world/rss.xml
2
+ http://rss.cnn.com/rss/cnn_topstories.rss
3
+ http://feeds.nytimes.com/nyt/rss/HomePage
4
  https://www.theguardian.com/world/rss
test.py CHANGED
@@ -1,4 +1,4 @@
1
- from database import DatabaseConnection
2
- db = DatabaseConnection()
3
- print(db.retrieve_articles())
4
  db.close()
 
1
from database import DatabaseConnection

# Smoke test: fetch and display all stored articles.  The connection is
# closed in a ``finally`` block so it is released even if retrieval fails
# (the original leaked the connection on error).
db = DatabaseConnection()
try:
    print(db.retrieve_articles())
finally:
    db.close()