Spaces:
Running
Running
michaelkri commited on
Commit ·
5d924ac
1
Parent(s): 469e535
Added Docker
Browse files- .devcontainer/devcontainer.json +24 -0
- .dockerignore +34 -0
- .github/dependabot.yml +12 -0
- .gitignore +3 -3
- Dockerfile +13 -0
- README.Docker.md +22 -0
- app/database.py +85 -85
- app/main.py +71 -71
- app/news_fetcher.py +55 -55
- app/scraper.py +27 -27
- app/summarizer.py +119 -119
- app/templates/index.html +122 -122
- app/update_news.py +50 -50
- compose.yaml +49 -0
- rss_feeds.txt +3 -3
- test.py +3 -3
.devcontainer/devcontainer.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
// The name of the dev container
|
| 3 |
+
"name": "Focal",
|
| 4 |
+
|
| 5 |
+
// The Dockerfile to use for building the dev container
|
| 6 |
+
"dockerFile": "../Dockerfile",
|
| 7 |
+
|
| 8 |
+
// This is the key setting to prevent VS Code from overriding your CMD.
|
| 9 |
+
// Setting it to 'false' ensures your Dockerfile's CMD command is respected.
|
| 10 |
+
"overrideCommand": false,
|
| 11 |
+
|
| 12 |
+
// Any ports to forward from the container to your local machine
|
| 13 |
+
"forwardPorts": [7860],
|
| 14 |
+
|
| 15 |
+
// Extensions to install in the container
|
| 16 |
+
"customizations": {
|
| 17 |
+
"vscode": {
|
| 18 |
+
"extensions": [
|
| 19 |
+
"ms-python.python",
|
| 20 |
+
"ms-python.vscode-pylance"
|
| 21 |
+
]
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
}
|
.dockerignore
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Include any files or directories that you don't want to be copied to your
|
| 2 |
+
# container here (e.g., local build artifacts, temporary files, etc.).
|
| 3 |
+
#
|
| 4 |
+
# For more help, visit the .dockerignore file reference guide at
|
| 5 |
+
# https://docs.docker.com/go/build-context-dockerignore/
|
| 6 |
+
|
| 7 |
+
**/.DS_Store
|
| 8 |
+
**/__pycache__
|
| 9 |
+
**/.venv
|
| 10 |
+
**/.classpath
|
| 11 |
+
**/.dockerignore
|
| 12 |
+
**/.env
|
| 13 |
+
**/.git
|
| 14 |
+
**/.gitignore
|
| 15 |
+
**/.project
|
| 16 |
+
**/.settings
|
| 17 |
+
**/.toolstarget
|
| 18 |
+
**/.vs
|
| 19 |
+
**/.vscode
|
| 20 |
+
**/*.*proj.user
|
| 21 |
+
**/*.dbmdl
|
| 22 |
+
**/*.jfm
|
| 23 |
+
**/bin
|
| 24 |
+
**/charts
|
| 25 |
+
**/docker-compose*
|
| 26 |
+
**/compose.y*ml
|
| 27 |
+
**/Dockerfile*
|
| 28 |
+
**/node_modules
|
| 29 |
+
**/npm-debug.log
|
| 30 |
+
**/obj
|
| 31 |
+
**/secrets.dev.yaml
|
| 32 |
+
**/values.dev.yaml
|
| 33 |
+
LICENSE
|
| 34 |
+
README.md
|
.github/dependabot.yml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# To get started with Dependabot version updates, you'll need to specify which
|
| 2 |
+
# package ecosystems to update and where the package manifests are located.
|
| 3 |
+
# Please see the documentation for more information:
|
| 4 |
+
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
|
| 5 |
+
# https://containers.dev/guide/dependabot
|
| 6 |
+
|
| 7 |
+
version: 2
|
| 8 |
+
updates:
|
| 9 |
+
- package-ecosystem: "devcontainers"
|
| 10 |
+
directory: "/"
|
| 11 |
+
schedule:
|
| 12 |
+
interval: weekly
|
.gitignore
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
__pycache__
|
| 2 |
-
venv
|
| 3 |
-
.venv
|
| 4 |
*.db
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
venv
|
| 3 |
+
.venv
|
| 4 |
*.db
|
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.9
|
| 2 |
+
|
| 3 |
+
RUN useradd -m -u 1000 user
|
| 4 |
+
USER user
|
| 5 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
| 10 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 11 |
+
|
| 12 |
+
COPY --chown=user . /app
|
| 13 |
+
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.Docker.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### Building and running your application
|
| 2 |
+
|
| 3 |
+
When you're ready, start your application by running:
|
| 4 |
+
`docker compose up --build`.
|
| 5 |
+
|
| 6 |
+
Your application will be available at http://localhost:7860.
|
| 7 |
+
|
| 8 |
+
### Deploying your application to the cloud
|
| 9 |
+
|
| 10 |
+
First, build your image, e.g.: `docker build -t myapp .`.
|
| 11 |
+
If your cloud uses a different CPU architecture than your development
|
| 12 |
+
machine (e.g., you are on a Mac M1 and your cloud provider is amd64),
|
| 13 |
+
you'll want to build the image for that platform, e.g.:
|
| 14 |
+
`docker build --platform=linux/amd64 -t myapp .`.
|
| 15 |
+
|
| 16 |
+
Then, push it to your registry, e.g. `docker push myregistry.com/myapp`.
|
| 17 |
+
|
| 18 |
+
Consult Docker's [getting started](https://docs.docker.com/go/get-started-sharing/)
|
| 19 |
+
docs for more detail on building and pushing.
|
| 20 |
+
|
| 21 |
+
### References
|
| 22 |
+
* [Docker's Python guide](https://docs.docker.com/language/python/)
|
app/database.py
CHANGED
|
@@ -1,86 +1,86 @@
|
|
| 1 |
-
from sqlalchemy import create_engine, UnicodeText, DateTime, ForeignKey
|
| 2 |
-
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session, joinedload, relationship
|
| 3 |
-
from sqlalchemy.sql import func
|
| 4 |
-
from contextlib import contextmanager
|
| 5 |
-
import datetime
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class Base(DeclarativeBase):
|
| 9 |
-
pass
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
class Article(Base):
|
| 13 |
-
__tablename__ = 'article'
|
| 14 |
-
|
| 15 |
-
id: Mapped[int] = mapped_column(primary_key=True)
|
| 16 |
-
title: Mapped[str] = mapped_column(UnicodeText)
|
| 17 |
-
content: Mapped[str] = mapped_column(UnicodeText)
|
| 18 |
-
date: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
|
| 19 |
-
|
| 20 |
-
sources = relationship('Source', backref='article')
|
| 21 |
-
|
| 22 |
-
def __repr__(self) -> str:
|
| 23 |
-
return f'Article(id={self.id}, title={self.title}, content={self.content})'
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
class Source(Base):
|
| 27 |
-
'''
|
| 28 |
-
Represents a source URL for an article.
|
| 29 |
-
'''
|
| 30 |
-
__tablename__ = 'source'
|
| 31 |
-
|
| 32 |
-
id: Mapped[int] = mapped_column(primary_key=True)
|
| 33 |
-
url: Mapped[str] = mapped_column(UnicodeText)
|
| 34 |
-
article_id: Mapped[int] = mapped_column(ForeignKey('article.id'))
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
# create an engine
|
| 38 |
-
engine = create_engine('sqlite:///news.db', echo=True)
|
| 39 |
-
|
| 40 |
-
# create tables if needed
|
| 41 |
-
Base.metadata.create_all(engine)
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
@contextmanager
|
| 45 |
-
def get_session():
|
| 46 |
-
'''
|
| 47 |
-
Context manager for creating and closing a database session
|
| 48 |
-
'''
|
| 49 |
-
# create a session
|
| 50 |
-
session = Session(engine)
|
| 51 |
-
try:
|
| 52 |
-
yield session
|
| 53 |
-
finally:
|
| 54 |
-
session.close()
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def add_article(session: Session, article: Article):
|
| 58 |
-
'''
|
| 59 |
-
Adds a new article to the database
|
| 60 |
-
'''
|
| 61 |
-
session.add(article)
|
| 62 |
-
session.commit()
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def retrieve_articles(session: Session):
|
| 66 |
-
'''
|
| 67 |
-
Returns a list containing all articles
|
| 68 |
-
'''
|
| 69 |
-
return session.query(Article).options(joinedload(Article.sources)).all()
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
def clear_articles(session: Session):
|
| 73 |
-
'''
|
| 74 |
-
Deletes all articles in the database
|
| 75 |
-
'''
|
| 76 |
-
session.query(Article).delete()
|
| 77 |
-
session.commit()
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
def add_sources(session: Session, sources: list[Source]):
|
| 81 |
-
'''
|
| 82 |
-
Adds the given sources to the database
|
| 83 |
-
'''
|
| 84 |
-
for source in sources:
|
| 85 |
-
session.add(source)
|
| 86 |
session.commit()
|
|
|
|
from sqlalchemy import create_engine, UnicodeText, DateTime, ForeignKey
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session, joinedload, relationship
from sqlalchemy.sql import func
from contextlib import contextmanager
import datetime


class Base(DeclarativeBase):
    '''Declarative base class shared by all ORM models.'''
    pass


class Article(Base):
    '''
    A summarized news article with its generation timestamp.
    '''
    __tablename__ = 'article'

    id: Mapped[int] = mapped_column(primary_key=True)
    title: Mapped[str] = mapped_column(UnicodeText)
    content: Mapped[str] = mapped_column(UnicodeText)
    # server-side default: the database sets the current time on insert
    date: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())

    # one-to-many: the source URLs this summary was built from
    sources = relationship('Source', backref='article')

    def __repr__(self) -> str:
        return f'Article(id={self.id}, title={self.title}, content={self.content})'


class Source(Base):
    '''
    Represents a source URL for an article.
    '''
    __tablename__ = 'source'

    id: Mapped[int] = mapped_column(primary_key=True)
    url: Mapped[str] = mapped_column(UnicodeText)
    article_id: Mapped[int] = mapped_column(ForeignKey('article.id'))


# create an engine (echo=True logs every SQL statement; consider disabling in production)
engine = create_engine('sqlite:///news.db', echo=True)

# create tables if needed
Base.metadata.create_all(engine)


@contextmanager
def get_session():
    '''
    Context manager for creating and closing a database session.

    Rolls back any uncommitted changes if the managed block raises,
    then always closes the session.
    '''
    # create a session
    session = Session(engine)
    try:
        yield session
    except Exception:
        # don't leave a half-applied transaction open on error
        session.rollback()
        raise
    finally:
        session.close()


def add_article(session: Session, article: Article) -> None:
    '''
    Adds a new article to the database and commits immediately.
    '''
    session.add(article)
    session.commit()


def retrieve_articles(session: Session) -> list[Article]:
    '''
    Returns a list containing all articles, with their sources eagerly
    loaded so they remain usable after the session closes.
    '''
    return session.query(Article).options(joinedload(Article.sources)).all()


def clear_articles(session: Session) -> None:
    '''
    Deletes all articles in the database and commits immediately.

    NOTE(review): rows in 'source' referencing the deleted articles are
    not removed here — confirm whether orphaned sources are intended.
    '''
    session.query(Article).delete()
    session.commit()


def add_sources(session: Session, sources: list[Source]) -> None:
    '''
    Adds the given sources to the database and commits once at the end.
    '''
    session.add_all(sources)
    session.commit()
|
app/main.py
CHANGED
|
@@ -1,71 +1,71 @@
|
|
| 1 |
-
from fastapi import FastAPI, Request, BackgroundTasks
|
| 2 |
-
from fastapi.templating import Jinja2Templates
|
| 3 |
-
from contextlib import asynccontextmanager
|
| 4 |
-
from apscheduler.schedulers.background import BackgroundScheduler
|
| 5 |
-
from .database import get_session, retrieve_articles
|
| 6 |
-
from .update_news import update_news
|
| 7 |
-
import threading
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
# for updating the news feed periodically
|
| 11 |
-
scheduler = BackgroundScheduler()
|
| 12 |
-
|
| 13 |
-
# prevent initiating multiple concurrent updates
|
| 14 |
-
is_updating = threading.Lock()
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
@asynccontextmanager
|
| 18 |
-
async def lifespan(app: FastAPI):
|
| 19 |
-
# update news periodically
|
| 20 |
-
scheduler.add_job(update_news, 'interval', minutes=30)
|
| 21 |
-
scheduler.start()
|
| 22 |
-
yield
|
| 23 |
-
# stop the scheduler when closing the server
|
| 24 |
-
scheduler.shutdown()
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
app = FastAPI(lifespan=lifespan)
|
| 28 |
-
|
| 29 |
-
templates = Jinja2Templates(directory='app/templates')
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def safe_update_news():
|
| 33 |
-
'''
|
| 34 |
-
Wrapper for update_news to ensure only one instance runs at a time.
|
| 35 |
-
'''
|
| 36 |
-
# check whether there is no update currently in progress
|
| 37 |
-
if not is_updating.locked():
|
| 38 |
-
# update news in background
|
| 39 |
-
with is_updating:
|
| 40 |
-
update_news()
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
@app.get('/')
|
| 44 |
-
async def read_root(request: Request, background_tasks: BackgroundTasks):
|
| 45 |
-
# retrieve articles from database
|
| 46 |
-
with get_session() as session:
|
| 47 |
-
articles = retrieve_articles(session=session)
|
| 48 |
-
|
| 49 |
-
# no articles yet
|
| 50 |
-
if not articles:
|
| 51 |
-
# update news in background
|
| 52 |
-
background_tasks.add_task(safe_update_news)
|
| 53 |
-
|
| 54 |
-
return templates.TemplateResponse(
|
| 55 |
-
'index.html',
|
| 56 |
-
{
|
| 57 |
-
'request': request,
|
| 58 |
-
'articles': articles,
|
| 59 |
-
'message': 'Summarizing news articles now.'
|
| 60 |
-
' Please check back in a few minutes.'
|
| 61 |
-
}
|
| 62 |
-
)
|
| 63 |
-
|
| 64 |
-
return templates.TemplateResponse(
|
| 65 |
-
'index.html',
|
| 66 |
-
{
|
| 67 |
-
'request': request,
|
| 68 |
-
'articles': articles,
|
| 69 |
-
'message': 'Additional articles are being summarized right now.' if is_updating.locked() else None
|
| 70 |
-
}
|
| 71 |
-
)
|
|
|
|
from fastapi import FastAPI, Request, BackgroundTasks
from fastapi.templating import Jinja2Templates
from contextlib import asynccontextmanager
from apscheduler.schedulers.background import BackgroundScheduler
from .database import get_session, retrieve_articles
from .update_news import update_news
import threading


# for updating the news feed periodically
scheduler = BackgroundScheduler()

# prevent initiating multiple concurrent updates
is_updating = threading.Lock()


@asynccontextmanager
async def lifespan(app: FastAPI):
    '''
    FastAPI lifespan handler: starts the periodic news-update job on
    startup and stops the scheduler on shutdown.
    '''
    # schedule the lock-guarded wrapper (not update_news directly) so a
    # scheduled run can never overlap with an update started by a request
    scheduler.add_job(safe_update_news, 'interval', minutes=30)
    scheduler.start()
    yield
    # stop the scheduler when closing the server
    scheduler.shutdown()


app = FastAPI(lifespan=lifespan)

templates = Jinja2Templates(directory='app/templates')


def safe_update_news():
    '''
    Wrapper for update_news to ensure only one instance runs at a time.
    '''
    # NOTE: locked() followed by acquire is not atomic, but a second
    # caller slipping through only blocks on the lock, never runs
    # update_news concurrently
    if not is_updating.locked():
        with is_updating:
            update_news()


@app.get('/')
async def read_root(request: Request, background_tasks: BackgroundTasks):
    '''
    Renders the front page with all summarized articles. If none exist
    yet, kicks off a background update and shows a waiting message.
    '''
    # retrieve articles from database
    with get_session() as session:
        articles = retrieve_articles(session=session)

    if not articles:
        # no articles yet: update news in background and ask the user to wait
        background_tasks.add_task(safe_update_news)
        message = ('Summarizing news articles now.'
                   ' Please check back in a few minutes.')
    else:
        # surface an in-progress notice when an update is currently running
        message = 'Additional articles are being summarized right now.' if is_updating.locked() else None

    return templates.TemplateResponse(
        'index.html',
        {
            'request': request,
            'articles': articles,
            'message': message
        }
    )
|
app/news_fetcher.py
CHANGED
|
@@ -1,56 +1,56 @@
|
|
| 1 |
-
import feedparser
|
| 2 |
-
from .summarizer import Summarizer
|
| 3 |
-
from .scraper import get_articles
|
| 4 |
-
from .database import Article, Source
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
def topic_summary(summarizer: Summarizer, query: str, max_results: int = 5, min_cluster_size: int = 2) -> str:
|
| 8 |
-
print(f'Beginning search for \'{query}\'...')
|
| 9 |
-
articles_and_urls = get_articles(query, max_results=max_results)
|
| 10 |
-
print(f'Retrieved {len(articles_and_urls)} articles')
|
| 11 |
-
articles = [item[0] for item in articles_and_urls]
|
| 12 |
-
urls = [item[1] for item in articles_and_urls]
|
| 13 |
-
summary = summarizer.combined_summary(articles, min_cluster_size=min_cluster_size)
|
| 14 |
-
return summary, urls
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def fetch_headlines(summarizer: Summarizer, rss_feed_urls: list[str], max_articles=30) -> list[str]:
|
| 18 |
-
# parse the RSS feeds
|
| 19 |
-
feeds = [feedparser.parse(url) for url in rss_feed_urls]
|
| 20 |
-
|
| 21 |
-
# extract max_article headlines from each feed
|
| 22 |
-
headlines = [entry.title for feed in feeds for entry in feed.entries[:max_articles]]
|
| 23 |
-
|
| 24 |
-
# create embeddings for each headline
|
| 25 |
-
embeddings = summarizer.create_embeddings(headlines)
|
| 26 |
-
|
| 27 |
-
# cluster headlines based on embeddings
|
| 28 |
-
clusters = summarizer.cluster_sentences(headlines, embeddings)
|
| 29 |
-
|
| 30 |
-
clustered_headlines = []
|
| 31 |
-
for cluster in clusters:
|
| 32 |
-
ranked_headlines = summarizer.rank_cluster_sentences(cluster)
|
| 33 |
-
|
| 34 |
-
generic_headline = ranked_headlines[0]
|
| 35 |
-
|
| 36 |
-
clustered_headlines.append(generic_headline)
|
| 37 |
-
|
| 38 |
-
return clustered_headlines
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def news_summary(summarizer: Summarizer, rss_feed_urls: list[str]):
|
| 42 |
-
# fetch current headlines
|
| 43 |
-
headlines = fetch_headlines(summarizer, rss_feed_urls)
|
| 44 |
-
|
| 45 |
-
# summarize each topic and yield articles
|
| 46 |
-
for headline in headlines:
|
| 47 |
-
summary, source_urls = topic_summary(summarizer, headline)
|
| 48 |
-
|
| 49 |
-
article = Article(
|
| 50 |
-
title=headline,
|
| 51 |
-
content=summary
|
| 52 |
-
)
|
| 53 |
-
|
| 54 |
-
sources = [Source(url=s_url, article_id=article.id) for s_url in source_urls]
|
| 55 |
-
|
| 56 |
yield article, sources
|
|
|
|
import feedparser
from .summarizer import Summarizer
from .scraper import get_articles
from .database import Article, Source


def topic_summary(summarizer: Summarizer, query: str, max_results: int = 5, min_cluster_size: int = 2) -> tuple[str, list[str]]:
    '''
    Searches for news articles matching `query` and returns a combined
    summary together with the list of source URLs that produced it.
    '''
    print(f'Beginning search for \'{query}\'...')
    articles_and_urls = get_articles(query, max_results=max_results)
    print(f'Retrieved {len(articles_and_urls)} articles')
    articles = [item[0] for item in articles_and_urls]
    urls = [item[1] for item in articles_and_urls]
    summary = summarizer.combined_summary(articles, min_cluster_size=min_cluster_size)
    return summary, urls


def fetch_headlines(summarizer: Summarizer, rss_feed_urls: list[str], max_articles: int = 30) -> list[str]:
    '''
    Fetches headlines from the given RSS feeds and collapses similar
    headlines into one representative headline per topic cluster.
    '''
    # parse the RSS feeds
    feeds = [feedparser.parse(url) for url in rss_feed_urls]

    # extract up to max_articles headlines from each feed
    headlines = [entry.title for feed in feeds for entry in feed.entries[:max_articles]]

    # create embeddings for each headline
    embeddings = summarizer.create_embeddings(headlines)

    # cluster headlines based on embeddings
    clusters = summarizer.cluster_sentences(headlines, embeddings)

    # keep the most representative headline of each cluster
    clustered_headlines = []
    for cluster in clusters:
        ranked_headlines = summarizer.rank_cluster_sentences(cluster)
        clustered_headlines.append(ranked_headlines[0])

    return clustered_headlines


def news_summary(summarizer: Summarizer, rss_feed_urls: list[str]):
    '''
    Generator yielding (article, sources) pairs: one summarized Article
    per clustered headline plus the Source rows for its URLs.
    '''
    # fetch current headlines
    headlines = fetch_headlines(summarizer, rss_feed_urls)

    # summarize each topic and yield articles
    for headline in headlines:
        summary, source_urls = topic_summary(summarizer, headline)

        article = Article(
            title=headline,
            content=summary
        )

        # NOTE(review): article.id is None until the article is flushed or
        # committed — confirm the caller persists the article before these
        # Source rows, otherwise article_id will be NULL.
        sources = [Source(url=s_url, article_id=article.id) for s_url in source_urls]

        yield article, sources
|
app/scraper.py
CHANGED
|
@@ -1,28 +1,28 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
import trafilatura
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
def retrieve_article(url: str) -> str:
|
| 6 |
-
try:
|
| 7 |
-
page = trafilatura.fetch_url(url)
|
| 8 |
-
return trafilatura.extract(page)
|
| 9 |
-
except Exception as e:
|
| 10 |
-
print(e.args)
|
| 11 |
-
return None
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
def get_articles(query, max_results=5, min_article_length=100) -> list[tuple[str, str]]:
|
| 15 |
-
# search DuckDuckGo for news on query
|
| 16 |
-
search_results = []
|
| 17 |
-
try:
|
| 18 |
-
search_results = DDGS().news(query, timelimit='d', max_results=max_results)
|
| 19 |
-
except Exception as e:
|
| 20 |
-
print(e.args)
|
| 21 |
-
|
| 22 |
-
# get article urls
|
| 23 |
-
urls = set([r['url'] for r in search_results])
|
| 24 |
-
|
| 25 |
-
# try to retrieve articles (filter inaccessible and short articles)
|
| 26 |
-
articles = [(article, url) for url in urls if (article := retrieve_article(url)) is not None and len(article) > min_article_length]
|
| 27 |
-
|
| 28 |
return articles
|
|
|
|
from ddgs import DDGS
import trafilatura


def retrieve_article(url: str) -> str | None:
    '''
    Downloads the page at `url` and extracts its main text.
    Returns None when the fetch or the extraction fails.
    '''
    try:
        page = trafilatura.fetch_url(url)
        return trafilatura.extract(page)
    except Exception as e:
        # best-effort scraping: log and skip unreachable pages
        print(e.args)
        return None


def get_articles(query: str, max_results: int = 5, min_article_length: int = 100) -> list[tuple[str, str]]:
    '''
    Searches DuckDuckGo News for `query` and returns (article_text, url)
    pairs, skipping pages that could not be fetched or whose extracted
    text is not longer than `min_article_length` characters.
    '''
    # search DuckDuckGo for news on query
    search_results = []
    try:
        search_results = DDGS().news(query, timelimit='d', max_results=max_results)
    except Exception as e:
        # a failed search yields an empty result list rather than crashing
        print(e.args)

    # deduplicate article urls
    urls = {r['url'] for r in search_results}

    # try to retrieve articles (filter inaccessible and short articles)
    articles = [(article, url) for url in urls if (article := retrieve_article(url)) is not None and len(article) > min_article_length]

    return articles
|
app/summarizer.py
CHANGED
|
@@ -1,120 +1,120 @@
|
|
| 1 |
-
import nltk
|
| 2 |
-
from nltk import tokenize
|
| 3 |
-
from sklearn.cluster import HDBSCAN
|
| 4 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
-
import numpy as np
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class Summarizer:
|
| 9 |
-
def __init__(self, embedding_model, summarization_model):
|
| 10 |
-
nltk.download('punkt_tab') # for tokenizing into sentences
|
| 11 |
-
|
| 12 |
-
self.embedding_model = embedding_model
|
| 13 |
-
self.summarization_model = summarization_model
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def cluster_sentences(self, sentences, embeddings, min_cluster_size=2):
|
| 17 |
-
hdb = HDBSCAN(min_cluster_size=min_cluster_size).fit(embeddings)
|
| 18 |
-
|
| 19 |
-
clusters = {}
|
| 20 |
-
for i, sentence in enumerate(sentences):
|
| 21 |
-
cluster_id = hdb.labels_[i]
|
| 22 |
-
if cluster_id == -1: # discard "noise"
|
| 23 |
-
continue
|
| 24 |
-
if cluster_id not in clusters:
|
| 25 |
-
clusters[cluster_id] = []
|
| 26 |
-
clusters[cluster_id].append((sentence, embeddings[i]))
|
| 27 |
-
|
| 28 |
-
return list(clusters.values())
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
def create_embeddings(self, sentences):
|
| 32 |
-
return self.embedding_model.encode(sentences)
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def summarize(self, content, min_length=30, max_length=130):
|
| 36 |
-
# truncate content if exceeds model capabilities
|
| 37 |
-
max_model_length = self.summarization_model.model.config.max_position_embeddings
|
| 38 |
-
if len(content) > max_model_length:
|
| 39 |
-
content = content[:max_model_length]
|
| 40 |
-
|
| 41 |
-
max_length = max(min_length, min(len(content), max_length))
|
| 42 |
-
|
| 43 |
-
return self.summarization_model(
|
| 44 |
-
content,
|
| 45 |
-
min_length=min_length,
|
| 46 |
-
max_length=max_length,
|
| 47 |
-
do_sample=False)[0]['summary_text']
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
@staticmethod
|
| 51 |
-
def rank_cluster_sentences(cluster):
|
| 52 |
-
# separate sentences and embeddings
|
| 53 |
-
sentences = [entry[0] for entry in cluster]
|
| 54 |
-
embeddings = [entry[1] for entry in cluster]
|
| 55 |
-
# find center of cluster
|
| 56 |
-
center = np.mean(embeddings, axis=0)
|
| 57 |
-
# score sentences by similarity to center
|
| 58 |
-
scores = cosine_similarity([center], embeddings)[0]
|
| 59 |
-
# rank sentences by score
|
| 60 |
-
ranked_sentences = [sentence for _, sentence in sorted(zip(scores, sentences), reverse=True)]
|
| 61 |
-
return ranked_sentences
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
def summarize_clusters(self, clusters, top_cluster_count=5):
|
| 65 |
-
# sort clusters by their length (descending) to find the most important topics
|
| 66 |
-
clusters = sorted(clusters, key=len, reverse=True)
|
| 67 |
-
|
| 68 |
-
# take only the top_cluster_count clusters
|
| 69 |
-
clusters = clusters[:top_cluster_count]
|
| 70 |
-
|
| 71 |
-
# combine key sentences from each cluster
|
| 72 |
-
key_sentences = []
|
| 73 |
-
for i, cluster in enumerate(clusters):
|
| 74 |
-
print(f'Extracting from cluster {i + 1}...')
|
| 75 |
-
top_sentences = Summarizer.rank_cluster_sentences(cluster)
|
| 76 |
-
content = '\n'.join(top_sentences[:3])
|
| 77 |
-
key_sentences.append(content)
|
| 78 |
-
|
| 79 |
-
combined = ' '.join(key_sentences)
|
| 80 |
-
|
| 81 |
-
# summarize all key sentences
|
| 82 |
-
print('Creating response...')
|
| 83 |
-
summary = self.summarize(
|
| 84 |
-
combined,
|
| 85 |
-
min_length=60,
|
| 86 |
-
max_length=250
|
| 87 |
-
)
|
| 88 |
-
return summary
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
def combined_summary(self, articles, min_cluster_size=3):
|
| 92 |
-
if not articles:
|
| 93 |
-
return 'No articles to summarize.'
|
| 94 |
-
|
| 95 |
-
print('Tokenizing into sentences...')
|
| 96 |
-
# create a list of all sentences from all articles
|
| 97 |
-
sentences = []
|
| 98 |
-
for article in articles:
|
| 99 |
-
sentences.extend(tokenize.sent_tokenize(str.strip(article)))
|
| 100 |
-
|
| 101 |
-
# remove duplicate sentences
|
| 102 |
-
sentences = sorted(list(set(sentences)), key=sentences.index)
|
| 103 |
-
print(f'Found {len(sentences)} unique sentences')
|
| 104 |
-
|
| 105 |
-
if not sentences:
|
| 106 |
-
return None
|
| 107 |
-
|
| 108 |
-
print('Creating sentence embeddings...')
|
| 109 |
-
# create embeddings
|
| 110 |
-
embeddings = self.create_embeddings(sentences)
|
| 111 |
-
|
| 112 |
-
print('Grouping sentences into clusters...')
|
| 113 |
-
# group (embeddings of) sentences by similarity
|
| 114 |
-
clusters = self.cluster_sentences(sentences, embeddings, min_cluster_size=min_cluster_size)
|
| 115 |
-
print(f'Created {len(clusters)} clusters')
|
| 116 |
-
|
| 117 |
-
# summarize all clusters into a single summary
|
| 118 |
-
summary = self.summarize_clusters(clusters)
|
| 119 |
-
|
| 120 |
return summary
|
|
|
|
import nltk
from nltk import tokenize
from sklearn.cluster import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


class Summarizer:
    '''
    Clusters related sentences by embedding similarity and produces an
    abstractive summary of the most important clusters.
    '''

    def __init__(self, embedding_model, summarization_model):
        nltk.download('punkt_tab')  # for tokenizing into sentences

        self.embedding_model = embedding_model
        self.summarization_model = summarization_model

    def cluster_sentences(self, sentences, embeddings, min_cluster_size=2):
        '''
        Groups sentences into clusters of semantically similar content
        using HDBSCAN over their embeddings. Noise points (label -1) are
        discarded. Returns a list of clusters, each a list of
        (sentence, embedding) pairs.
        '''
        hdb = HDBSCAN(min_cluster_size=min_cluster_size).fit(embeddings)

        clusters = {}
        for i, sentence in enumerate(sentences):
            cluster_id = hdb.labels_[i]
            if cluster_id == -1:  # discard "noise"
                continue
            clusters.setdefault(cluster_id, []).append((sentence, embeddings[i]))

        return list(clusters.values())

    def create_embeddings(self, sentences):
        '''Returns one embedding vector per input sentence.'''
        return self.embedding_model.encode(sentences)

    def summarize(self, content, min_length=30, max_length=130):
        '''
        Summarizes `content` with the summarization model, clamping the
        requested summary length to the content length.
        '''
        # truncate content if it exceeds model capabilities
        # NOTE(review): max_position_embeddings counts tokens, but the slice
        # below is in characters — this over-truncates long inputs; confirm
        # whether token-based truncation was intended.
        max_model_length = self.summarization_model.model.config.max_position_embeddings
        if len(content) > max_model_length:
            content = content[:max_model_length]

        max_length = max(min_length, min(len(content), max_length))

        return self.summarization_model(
            content,
            min_length=min_length,
            max_length=max_length,
            do_sample=False)[0]['summary_text']

    @staticmethod
    def rank_cluster_sentences(cluster):
        '''
        Orders a cluster's sentences by how close each is to the cluster
        centroid, most representative first.
        '''
        # separate sentences and embeddings
        sentences = [entry[0] for entry in cluster]
        embeddings = [entry[1] for entry in cluster]
        # find center of cluster
        center = np.mean(embeddings, axis=0)
        # score sentences by similarity to center
        scores = cosine_similarity([center], embeddings)[0]
        # rank sentences by score (descending)
        ranked_sentences = [sentence for _, sentence in sorted(zip(scores, sentences), reverse=True)]
        return ranked_sentences

    def summarize_clusters(self, clusters, top_cluster_count=5):
        '''
        Summarizes the `top_cluster_count` largest clusters into a single
        text: extracts each cluster's three most representative sentences
        and summarizes their concatenation.
        '''
        # sort clusters by their size (descending) to find the most
        # important topics, then keep only the top_cluster_count largest
        clusters = sorted(clusters, key=len, reverse=True)[:top_cluster_count]

        # combine key sentences from each cluster
        key_sentences = []
        for i, cluster in enumerate(clusters):
            print(f'Extracting from cluster {i + 1}...')
            top_sentences = Summarizer.rank_cluster_sentences(cluster)
            key_sentences.append('\n'.join(top_sentences[:3]))

        combined = ' '.join(key_sentences)

        # summarize all key sentences
        print('Creating response...')
        return self.summarize(
            combined,
            min_length=60,
            max_length=250
        )

    def combined_summary(self, articles, min_cluster_size=3):
        '''
        Produces a single summary from a list of article texts.
        Returns a placeholder string when `articles` is empty, and None
        when the articles contain no sentences after tokenization.
        '''
        if not articles:
            return 'No articles to summarize.'

        print('Tokenizing into sentences...')
        # create a list of all sentences from all articles
        sentences = []
        for article in articles:
            sentences.extend(tokenize.sent_tokenize(article.strip()))

        # remove duplicate sentences while preserving first-seen order
        # (dict.fromkeys is O(n); the previous sort by sentences.index was O(n^2))
        sentences = list(dict.fromkeys(sentences))
        print(f'Found {len(sentences)} unique sentences')

        if not sentences:
            return None

        print('Creating sentence embeddings...')
        embeddings = self.create_embeddings(sentences)

        print('Grouping sentences into clusters...')
        # group (embeddings of) sentences by similarity
        clusters = self.cluster_sentences(sentences, embeddings, min_cluster_size=min_cluster_size)
        print(f'Created {len(clusters)} clusters')

        # summarize all clusters into a single summary
        return self.summarize_clusters(clusters)
|
app/templates/index.html
CHANGED
|
@@ -1,123 +1,123 @@
|
|
| 1 |
-
<!DOCTYPE html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
<meta charset="UTF-8">
|
| 5 |
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
-
<title>Focal</title>
|
| 7 |
-
<script src="https://cdn.tailwindcss.com"></script>
|
| 8 |
-
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 9 |
-
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 10 |
-
<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@600&family=Inter:wght@400;500&display=swap" rel="stylesheet">
|
| 11 |
-
<script>
|
| 12 |
-
tailwind.config = {
|
| 13 |
-
theme: {
|
| 14 |
-
extend: {
|
| 15 |
-
fontFamily: {
|
| 16 |
-
sans: ['Inter', 'sans-serif'],
|
| 17 |
-
display: ['Poppins', 'sans-serif'],
|
| 18 |
-
},
|
| 19 |
-
colors: {
|
| 20 |
-
'background': '#FFFFFF',
|
| 21 |
-
'background-light': '#F9F9F9',
|
| 22 |
-
'text-primary': '#121212',
|
| 23 |
-
'primary': '#2563EB',
|
| 24 |
-
'primary-dark': '#1E40AF',
|
| 25 |
-
}
|
| 26 |
-
}
|
| 27 |
-
}
|
| 28 |
-
}
|
| 29 |
-
</script>
|
| 30 |
-
<style>
|
| 31 |
-
.collapsible-content {
|
| 32 |
-
max-height: 0;
|
| 33 |
-
overflow: hidden;
|
| 34 |
-
transition: max-height 0.5s cubic-bezier(0.4, 0, 0.2, 1);
|
| 35 |
-
}
|
| 36 |
-
.collapsible-content.expanded {
|
| 37 |
-
max-height: 500px;
|
| 38 |
-
}
|
| 39 |
-
.collapsible-content .border {
|
| 40 |
-
border-color: #E5E7EB;
|
| 41 |
-
}
|
| 42 |
-
</style>
|
| 43 |
-
</head>
|
| 44 |
-
<body class="bg-background text-text-primary font-sans antialiased">
|
| 45 |
-
|
| 46 |
-
<header class="container mx-auto px-6 py-5">
|
| 47 |
-
<div class="flex items-center border-b border-primary/20 pb-3">
|
| 48 |
-
<svg class="h-8 w-8 mr-2" viewBox="0 0 32 32" fill="black" xmlns="http://www.w3.org/2000/svg">
|
| 49 |
-
<circle cx="16" cy="16" r="10" />
|
| 50 |
-
</svg>
|
| 51 |
-
<a href="#" class="font-display text-2xl tracking-widest uppercase">Focal</a>
|
| 52 |
-
</div>
|
| 53 |
-
</header>
|
| 54 |
-
|
| 55 |
-
<main class="container mx-auto px-6 py-12 max-w-4xl">
|
| 56 |
-
|
| 57 |
-
{% if message %}
|
| 58 |
-
<div class="text-center text-lg text-text-primary">
|
| 59 |
-
{{ message }}
|
| 60 |
-
</div>
|
| 61 |
-
{% endif %}
|
| 62 |
-
|
| 63 |
-
{% for article in articles %}
|
| 64 |
-
<article class="mb-20 md:mb-24">
|
| 65 |
-
<p class="text-sm font-medium text-primary uppercase tracking-wider mb-3">{{ article.date }}</p>
|
| 66 |
-
<h1 class="font-display font-semibold text-2xl md:text-3xl text-text-primary mb-6 leading-tight">
|
| 67 |
-
{{ article.title }}
|
| 68 |
-
</h1>
|
| 69 |
-
<div class="text-lg text-text-primary/80 leading-relaxed space-y-5">
|
| 70 |
-
<p>
|
| 71 |
-
{{ article.content }}
|
| 72 |
-
</p>
|
| 73 |
-
</div>
|
| 74 |
-
{% if article.sources %}
|
| 75 |
-
<div class="mt-8">
|
| 76 |
-
<button class="toggle-button flex items-center text-primary font-medium text-sm transition-colors hover:text-text-primary">
|
| 77 |
-
<span class="mr-2">Further Reading</span>
|
| 78 |
-
<svg class="plus-icon h-4 w-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M12 4.5v15m7.5-7.5h-15" /></svg>
|
| 79 |
-
<svg class="minus-icon h-4 w-4 hidden" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M19.5 12h-15" /></svg>
|
| 80 |
-
</button>
|
| 81 |
-
|
| 82 |
-
<div class="collapsible-content mt-4">
|
| 83 |
-
<div class="bg-background-light p-5 rounded-md border border-gray-700/50">
|
| 84 |
-
<ul class="space-y-3 list-disc list-inside text-text-primary/70">
|
| 85 |
-
{% for source in article.sources %}
|
| 86 |
-
<li><a href="{{ source.url }}" class="hover:text-primary transition-colors">{{ source.url }}</a></li>
|
| 87 |
-
{% endfor %}
|
| 88 |
-
</ul>
|
| 89 |
-
</div>
|
| 90 |
-
</div>
|
| 91 |
-
</div>
|
| 92 |
-
{% endif %}
|
| 93 |
-
</article>
|
| 94 |
-
{% endfor %}
|
| 95 |
-
|
| 96 |
-
</main>
|
| 97 |
-
|
| 98 |
-
<footer class="container mx-auto px-6 py-4 text-center text-sm text-text-primary/60">
|
| 99 |
-
<p>All content is AI-generated, and may contain inaccuracies.</p>
|
| 100 |
-
</footer>
|
| 101 |
-
|
| 102 |
-
<script>
|
| 103 |
-
// JavaScript for the collapsible sections
|
| 104 |
-
document.addEventListener('DOMContentLoaded', function () {
|
| 105 |
-
const allToggleButtons = document.querySelectorAll('.toggle-button');
|
| 106 |
-
|
| 107 |
-
allToggleButtons.forEach(button => {
|
| 108 |
-
button.addEventListener('click', function () {
|
| 109 |
-
const content = this.nextElementSibling;
|
| 110 |
-
const plusIcon = this.querySelector('.plus-icon');
|
| 111 |
-
const minusIcon = this.querySelector('.minus-icon');
|
| 112 |
-
|
| 113 |
-
content.classList.toggle('expanded');
|
| 114 |
-
|
| 115 |
-
plusIcon.classList.toggle('hidden');
|
| 116 |
-
minusIcon.classList.toggle('hidden');
|
| 117 |
-
});
|
| 118 |
-
});
|
| 119 |
-
});
|
| 120 |
-
</script>
|
| 121 |
-
|
| 122 |
-
</body>
|
| 123 |
</html>
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Focal</title>
|
| 7 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 8 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 9 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 10 |
+
<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@600&family=Inter:wght@400;500&display=swap" rel="stylesheet">
|
| 11 |
+
<script>
|
| 12 |
+
tailwind.config = {
|
| 13 |
+
theme: {
|
| 14 |
+
extend: {
|
| 15 |
+
fontFamily: {
|
| 16 |
+
sans: ['Inter', 'sans-serif'],
|
| 17 |
+
display: ['Poppins', 'sans-serif'],
|
| 18 |
+
},
|
| 19 |
+
colors: {
|
| 20 |
+
'background': '#FFFFFF',
|
| 21 |
+
'background-light': '#F9F9F9',
|
| 22 |
+
'text-primary': '#121212',
|
| 23 |
+
'primary': '#2563EB',
|
| 24 |
+
'primary-dark': '#1E40AF',
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
</script>
|
| 30 |
+
<style>
|
| 31 |
+
.collapsible-content {
|
| 32 |
+
max-height: 0;
|
| 33 |
+
overflow: hidden;
|
| 34 |
+
transition: max-height 0.5s cubic-bezier(0.4, 0, 0.2, 1);
|
| 35 |
+
}
|
| 36 |
+
.collapsible-content.expanded {
|
| 37 |
+
max-height: 500px;
|
| 38 |
+
}
|
| 39 |
+
.collapsible-content .border {
|
| 40 |
+
border-color: #E5E7EB;
|
| 41 |
+
}
|
| 42 |
+
</style>
|
| 43 |
+
</head>
|
| 44 |
+
<body class="bg-background text-text-primary font-sans antialiased">
|
| 45 |
+
|
| 46 |
+
<header class="container mx-auto px-6 py-5">
|
| 47 |
+
<div class="flex items-center border-b border-primary/20 pb-3">
|
| 48 |
+
<svg class="h-8 w-8 mr-2" viewBox="0 0 32 32" fill="black" xmlns="http://www.w3.org/2000/svg">
|
| 49 |
+
<circle cx="16" cy="16" r="10" />
|
| 50 |
+
</svg>
|
| 51 |
+
<a href="#" class="font-display text-2xl tracking-widest uppercase">Focal</a>
|
| 52 |
+
</div>
|
| 53 |
+
</header>
|
| 54 |
+
|
| 55 |
+
<main class="container mx-auto px-6 py-12 max-w-4xl">
|
| 56 |
+
|
| 57 |
+
{% if message %}
|
| 58 |
+
<div class="text-center text-lg text-text-primary">
|
| 59 |
+
{{ message }}
|
| 60 |
+
</div>
|
| 61 |
+
{% endif %}
|
| 62 |
+
|
| 63 |
+
{% for article in articles %}
|
| 64 |
+
<article class="mb-20 md:mb-24">
|
| 65 |
+
<p class="text-sm font-medium text-primary uppercase tracking-wider mb-3">{{ article.date }}</p>
|
| 66 |
+
<h1 class="font-display font-semibold text-2xl md:text-3xl text-text-primary mb-6 leading-tight">
|
| 67 |
+
{{ article.title }}
|
| 68 |
+
</h1>
|
| 69 |
+
<div class="text-lg text-text-primary/80 leading-relaxed space-y-5">
|
| 70 |
+
<p>
|
| 71 |
+
{{ article.content }}
|
| 72 |
+
</p>
|
| 73 |
+
</div>
|
| 74 |
+
{% if article.sources %}
|
| 75 |
+
<div class="mt-8">
|
| 76 |
+
<button class="toggle-button flex items-center text-primary font-medium text-sm transition-colors hover:text-text-primary">
|
| 77 |
+
<span class="mr-2">Further Reading</span>
|
| 78 |
+
<svg class="plus-icon h-4 w-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M12 4.5v15m7.5-7.5h-15" /></svg>
|
| 79 |
+
<svg class="minus-icon h-4 w-4 hidden" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2.5" stroke="currentColor"><path stroke-linecap="round" stroke-linejoin="round" d="M19.5 12h-15" /></svg>
|
| 80 |
+
</button>
|
| 81 |
+
|
| 82 |
+
<div class="collapsible-content mt-4">
|
| 83 |
+
<div class="bg-background-light p-5 rounded-md border border-gray-700/50">
|
| 84 |
+
<ul class="space-y-3 list-disc list-inside text-text-primary/70">
|
| 85 |
+
{% for source in article.sources %}
|
| 86 |
+
<li><a href="{{ source.url }}" class="hover:text-primary transition-colors">{{ source.url }}</a></li>
|
| 87 |
+
{% endfor %}
|
| 88 |
+
</ul>
|
| 89 |
+
</div>
|
| 90 |
+
</div>
|
| 91 |
+
</div>
|
| 92 |
+
{% endif %}
|
| 93 |
+
</article>
|
| 94 |
+
{% endfor %}
|
| 95 |
+
|
| 96 |
+
</main>
|
| 97 |
+
|
| 98 |
+
<footer class="container mx-auto px-6 py-4 text-center text-sm text-text-primary/60">
|
| 99 |
+
<p>All content is AI-generated, and may contain inaccuracies.</p>
|
| 100 |
+
</footer>
|
| 101 |
+
|
| 102 |
+
<script>
|
| 103 |
+
// JavaScript for the collapsible sections
|
| 104 |
+
document.addEventListener('DOMContentLoaded', function () {
|
| 105 |
+
const allToggleButtons = document.querySelectorAll('.toggle-button');
|
| 106 |
+
|
| 107 |
+
allToggleButtons.forEach(button => {
|
| 108 |
+
button.addEventListener('click', function () {
|
| 109 |
+
const content = this.nextElementSibling;
|
| 110 |
+
const plusIcon = this.querySelector('.plus-icon');
|
| 111 |
+
const minusIcon = this.querySelector('.minus-icon');
|
| 112 |
+
|
| 113 |
+
content.classList.toggle('expanded');
|
| 114 |
+
|
| 115 |
+
plusIcon.classList.toggle('hidden');
|
| 116 |
+
minusIcon.classList.toggle('hidden');
|
| 117 |
+
});
|
| 118 |
+
});
|
| 119 |
+
});
|
| 120 |
+
</script>
|
| 121 |
+
|
| 122 |
+
</body>
|
| 123 |
</html>
|
app/update_news.py
CHANGED
|
@@ -1,50 +1,50 @@
|
|
| 1 |
-
from sentence_transformers import SentenceTransformer
|
| 2 |
-
from transformers import pipeline
|
| 3 |
-
from .summarizer import Summarizer
|
| 4 |
-
from .news_fetcher import news_summary
|
| 5 |
-
from .database import get_session, clear_articles, add_article, add_sources
|
| 6 |
-
import datetime
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def read_rss_feed_urls(filename : str ='rss_feeds.txt') -> list[str]:
|
| 10 |
-
try:
|
| 11 |
-
with open(filename, 'r') as f:
|
| 12 |
-
urls = [url.strip() for url in f.readlines()]
|
| 13 |
-
return urls
|
| 14 |
-
except:
|
| 15 |
-
return None
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def update_news():
|
| 20 |
-
print(f'Initiating news update: {datetime.datetime.now()}')
|
| 21 |
-
|
| 22 |
-
# model to create embeddings from sentences
|
| 23 |
-
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
| 24 |
-
|
| 25 |
-
# model to summarize text
|
| 26 |
-
summarizing_model = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 27 |
-
|
| 28 |
-
# wraps models
|
| 29 |
-
summarizer = Summarizer(embedding_model=embedding_model, summarization_model=summarizing_model)
|
| 30 |
-
|
| 31 |
-
# rss feeds for current headlines
|
| 32 |
-
rss_feed_urls = read_rss_feed_urls()
|
| 33 |
-
|
| 34 |
-
# clear old articles
|
| 35 |
-
with get_session() as session:
|
| 36 |
-
clear_articles(session)
|
| 37 |
-
|
| 38 |
-
# summarize articles and add to database
|
| 39 |
-
for article, sources in news_summary(summarizer, rss_feed_urls):
|
| 40 |
-
with get_session() as session:
|
| 41 |
-
add_article(session, article)
|
| 42 |
-
|
| 43 |
-
for source in sources:
|
| 44 |
-
source.article_id = article.id
|
| 45 |
-
|
| 46 |
-
add_sources(session, sources)
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
if __name__ == '__main__':
|
| 50 |
-
update_news()
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
from transformers import pipeline
|
| 3 |
+
from .summarizer import Summarizer
|
| 4 |
+
from .news_fetcher import news_summary
|
| 5 |
+
from .database import get_session, clear_articles, add_article, add_sources
|
| 6 |
+
import datetime
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def read_rss_feed_urls(filename : str ='rss_feeds.txt') -> list[str]:
|
| 10 |
+
try:
|
| 11 |
+
with open(filename, 'r') as f:
|
| 12 |
+
urls = [url.strip() for url in f.readlines()]
|
| 13 |
+
return urls
|
| 14 |
+
except:
|
| 15 |
+
return None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def update_news():
|
| 20 |
+
print(f'Initiating news update: {datetime.datetime.now()}')
|
| 21 |
+
|
| 22 |
+
# model to create embeddings from sentences
|
| 23 |
+
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
| 24 |
+
|
| 25 |
+
# model to summarize text
|
| 26 |
+
summarizing_model = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 27 |
+
|
| 28 |
+
# wraps models
|
| 29 |
+
summarizer = Summarizer(embedding_model=embedding_model, summarization_model=summarizing_model)
|
| 30 |
+
|
| 31 |
+
# rss feeds for current headlines
|
| 32 |
+
rss_feed_urls = read_rss_feed_urls()
|
| 33 |
+
|
| 34 |
+
# clear old articles
|
| 35 |
+
with get_session() as session:
|
| 36 |
+
clear_articles(session)
|
| 37 |
+
|
| 38 |
+
# summarize articles and add to database
|
| 39 |
+
for article, sources in news_summary(summarizer, rss_feed_urls):
|
| 40 |
+
with get_session() as session:
|
| 41 |
+
add_article(session, article)
|
| 42 |
+
|
| 43 |
+
for source in sources:
|
| 44 |
+
source.article_id = article.id
|
| 45 |
+
|
| 46 |
+
add_sources(session, sources)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
if __name__ == '__main__':
|
| 50 |
+
update_news()
|
compose.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Comments are provided throughout this file to help you get started.
|
| 2 |
+
# If you need more help, visit the Docker Compose reference guide at
|
| 3 |
+
# https://docs.docker.com/go/compose-spec-reference/
|
| 4 |
+
|
| 5 |
+
# Here the instructions define your application as a service called "server".
|
| 6 |
+
# This service is built from the Dockerfile in the current directory.
|
| 7 |
+
# You can add other services your application may depend on here, such as a
|
| 8 |
+
# database or a cache. For examples, see the Awesome Compose repository:
|
| 9 |
+
# https://github.com/docker/awesome-compose
|
| 10 |
+
services:
|
| 11 |
+
server:
|
| 12 |
+
build:
|
| 13 |
+
context: .
|
| 14 |
+
ports:
|
| 15 |
+
- 7860:7860
|
| 16 |
+
|
| 17 |
+
# The commented out section below is an example of how to define a PostgreSQL
|
| 18 |
+
# database that your application can use. `depends_on` tells Docker Compose to
|
| 19 |
+
# start the database before your application. The `db-data` volume persists the
|
| 20 |
+
# database data between container restarts. The `db-password` secret is used
|
| 21 |
+
# to set the database password. You must create `db/password.txt` and add
|
| 22 |
+
# a password of your choosing to it before running `docker compose up`.
|
| 23 |
+
# depends_on:
|
| 24 |
+
# db:
|
| 25 |
+
# condition: service_healthy
|
| 26 |
+
# db:
|
| 27 |
+
# image: postgres
|
| 28 |
+
# restart: always
|
| 29 |
+
# user: postgres
|
| 30 |
+
# secrets:
|
| 31 |
+
# - db-password
|
| 32 |
+
# volumes:
|
| 33 |
+
# - db-data:/var/lib/postgresql/data
|
| 34 |
+
# environment:
|
| 35 |
+
# - POSTGRES_DB=example
|
| 36 |
+
# - POSTGRES_PASSWORD_FILE=/run/secrets/db-password
|
| 37 |
+
# expose:
|
| 38 |
+
# - 5432
|
| 39 |
+
# healthcheck:
|
| 40 |
+
# test: [ "CMD", "pg_isready" ]
|
| 41 |
+
# interval: 10s
|
| 42 |
+
# timeout: 5s
|
| 43 |
+
# retries: 5
|
| 44 |
+
# volumes:
|
| 45 |
+
# db-data:
|
| 46 |
+
# secrets:
|
| 47 |
+
# db-password:
|
| 48 |
+
# file: db/password.txt
|
| 49 |
+
|
rss_feeds.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
http://feeds.bbci.co.uk/news/world/rss.xml
|
| 2 |
-
http://rss.cnn.com/rss/cnn_topstories.rss
|
| 3 |
-
http://feeds.nytimes.com/nyt/rss/HomePage
|
| 4 |
https://www.theguardian.com/world/rss
|
|
|
|
| 1 |
+
http://feeds.bbci.co.uk/news/world/rss.xml
|
| 2 |
+
http://rss.cnn.com/rss/cnn_topstories.rss
|
| 3 |
+
http://feeds.nytimes.com/nyt/rss/HomePage
|
| 4 |
https://www.theguardian.com/world/rss
|
test.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from database import DatabaseConnection
|
| 2 |
-
db = DatabaseConnection()
|
| 3 |
-
print(db.retrieve_articles())
|
| 4 |
db.close()
|
|
|
|
| 1 |
+
# Ad-hoc smoke test: print everything currently stored in the database.
# NOTE(review): app/update_news.py in this same commit imports get_session /
# add_article style helpers from .database, not a DatabaseConnection class —
# confirm this script still matches the database module's current API.
from database import DatabaseConnection
db = DatabaseConnection()
print(db.retrieve_articles())
db.close()
|