Spaces:
Running
Running
Erik Sarriegui committed on
Commit ·
588462f
1
Parent(s): d3c610f
first push
Browse files- Dockerfile +44 -0
- frontend/index.html +16 -0
- frontend/package.json +29 -0
- frontend/postcss.config.js +6 -0
- frontend/src/App.jsx +78 -0
- frontend/src/components/Aggregator.jsx +108 -0
- frontend/src/components/Analyzer.jsx +141 -0
- frontend/src/components/ClusterCard.jsx +78 -0
- frontend/src/index.css +9 -0
- frontend/src/lib/utils.js +6 -0
- frontend/src/main.jsx +10 -0
- frontend/tailwind.config.js +22 -0
- frontend/vite.config.js +21 -0
- main.py +196 -0
- requirements.txt +12 -0
- src/dataset.parquet +3 -0
Dockerfile
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Stage 1: Build Frontend
FROM node:18-alpine AS frontend-build

WORKDIR /app/frontend

# Copy only the manifest first so the dependency layer stays cached
# until package.json itself changes.
COPY frontend/package.json ./
RUN npm install

COPY frontend/ ./
RUN npm run build

# Stage 2: Backend & Runtime
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies
# --no-install-recommends keeps optional packages out of the runtime image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libxml2-dev \
    libxslt-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy backend code
COPY main.py .
COPY src/ src/

# Copy built frontend from Stage 1
COPY --from=frontend-build /app/frontend/dist /app/static

# Create cache directory for Hugging Face models to ensure they are writable
# HF Spaces run as user 1000
RUN mkdir -p /app/cache && chmod 777 /app/cache
ENV HF_HOME=/app/cache

# Expose port (7860 is default for HF Spaces)
EXPOSE 7860

# Run the app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
frontend/index.html
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
+
<title>News Intelligence</title>
|
| 8 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 9 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 10 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
|
| 11 |
+
</head>
|
| 12 |
+
<body class="bg-slate-950 text-slate-50 font-sans">
|
| 13 |
+
<div id="root"></div>
|
| 14 |
+
<script type="module" src="/src/main.jsx"></script>
|
| 15 |
+
</body>
|
| 16 |
+
</html>
|
frontend/package.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "frontend",
|
| 3 |
+
"private": true,
|
| 4 |
+
"version": "0.0.0",
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"dev": "vite",
|
| 8 |
+
"build": "vite build",
|
| 9 |
+
"preview": "vite preview"
|
| 10 |
+
},
|
| 11 |
+
"dependencies": {
|
| 12 |
+
"axios": "^1.6.7",
|
| 13 |
+
"clsx": "^2.1.0",
|
| 14 |
+
"framer-motion": "^11.0.3",
|
| 15 |
+
"lucide-react": "^0.330.0",
|
| 16 |
+
"react": "^18.2.0",
|
| 17 |
+
"react-dom": "^18.2.0",
|
| 18 |
+
"tailwind-merge": "^2.2.1"
|
| 19 |
+
},
|
| 20 |
+
"devDependencies": {
|
| 21 |
+
"@types/react": "^18.2.55",
|
| 22 |
+
"@types/react-dom": "^18.2.19",
|
| 23 |
+
"@vitejs/plugin-react": "^4.2.1",
|
| 24 |
+
"autoprefixer": "^10.4.17",
|
| 25 |
+
"postcss": "^8.4.35",
|
| 26 |
+
"tailwindcss": "^3.4.1",
|
| 27 |
+
"vite": "^5.1.0"
|
| 28 |
+
}
|
| 29 |
+
}
|
frontend/postcss.config.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// PostCSS plugins, both with their default options.
const plugins = {
  tailwindcss: {},
  autoprefixer: {},
};

export default { plugins };
|
frontend/src/App.jsx
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from 'react';
|
| 2 |
+
import { Newspaper, Search, LayoutGrid } from 'lucide-react';
|
| 3 |
+
import { motion, AnimatePresence } from 'framer-motion';
|
| 4 |
+
import Aggregator from './components/Aggregator';
|
| 5 |
+
import Analyzer from './components/Analyzer';
|
| 6 |
+
import { cn } from './lib/utils';
|
| 7 |
+
|
| 8 |
+
function App() {
|
| 9 |
+
const [activeTab, setActiveTab] = useState('aggregator');
|
| 10 |
+
|
| 11 |
+
return (
|
| 12 |
+
<div className="min-h-screen bg-slate-950 text-slate-100 flex flex-col items-center py-10 px-4">
|
| 13 |
+
<header className="mb-8 text-center">
|
| 14 |
+
<h1 className="text-4xl font-bold bg-gradient-to-r from-blue-400 to-purple-500 bg-clip-text text-transparent mb-2">
|
| 15 |
+
News Intelligence
|
| 16 |
+
</h1>
|
| 17 |
+
<p className="text-slate-400">Advanced Aggregation & Analysis</p>
|
| 18 |
+
</header>
|
| 19 |
+
|
| 20 |
+
{/* Navigation */}
|
| 21 |
+
<div className="bg-slate-900/50 p-1 rounded-xl flex gap-1 mb-8 border border-slate-800 backdrop-blur-sm">
|
| 22 |
+
<button
|
| 23 |
+
onClick={() => setActiveTab('aggregator')}
|
| 24 |
+
className={cn(
|
| 25 |
+
"flex items-center gap-2 px-6 py-2.5 rounded-lg text-sm font-medium transition-all duration-300",
|
| 26 |
+
activeTab === 'aggregator'
|
| 27 |
+
? "bg-blue-600 text-white shadow-lg shadow-blue-900/20"
|
| 28 |
+
: "text-slate-400 hover:text-white hover:bg-slate-800"
|
| 29 |
+
)}
|
| 30 |
+
>
|
| 31 |
+
<LayoutGrid className="w-4 h-4" />
|
| 32 |
+
Event Stream
|
| 33 |
+
</button>
|
| 34 |
+
<button
|
| 35 |
+
onClick={() => setActiveTab('analyzer')}
|
| 36 |
+
className={cn(
|
| 37 |
+
"flex items-center gap-2 px-6 py-2.5 rounded-lg text-sm font-medium transition-all duration-300",
|
| 38 |
+
activeTab === 'analyzer'
|
| 39 |
+
? "bg-purple-600 text-white shadow-lg shadow-purple-900/20"
|
| 40 |
+
: "text-slate-400 hover:text-white hover:bg-slate-800"
|
| 41 |
+
)}
|
| 42 |
+
>
|
| 43 |
+
<Search className="w-4 h-4" />
|
| 44 |
+
Analyzer
|
| 45 |
+
</button>
|
| 46 |
+
</div>
|
| 47 |
+
|
| 48 |
+
{/* Content Area */}
|
| 49 |
+
<main className="w-full max-w-5xl">
|
| 50 |
+
<AnimatePresence mode="wait">
|
| 51 |
+
{activeTab === 'aggregator' ? (
|
| 52 |
+
<motion.div
|
| 53 |
+
key="aggregator"
|
| 54 |
+
initial={{ opacity: 0, y: 20 }}
|
| 55 |
+
animate={{ opacity: 1, y: 0 }}
|
| 56 |
+
exit={{ opacity: 0, y: -20 }}
|
| 57 |
+
transition={{ duration: 0.3 }}
|
| 58 |
+
>
|
| 59 |
+
<Aggregator />
|
| 60 |
+
</motion.div>
|
| 61 |
+
) : (
|
| 62 |
+
<motion.div
|
| 63 |
+
key="analyzer"
|
| 64 |
+
initial={{ opacity: 0, y: 20 }}
|
| 65 |
+
animate={{ opacity: 1, y: 0 }}
|
| 66 |
+
exit={{ opacity: 0, y: -20 }}
|
| 67 |
+
transition={{ duration: 0.3 }}
|
| 68 |
+
>
|
| 69 |
+
<Analyzer />
|
| 70 |
+
</motion.div>
|
| 71 |
+
)}
|
| 72 |
+
</AnimatePresence>
|
| 73 |
+
</main>
|
| 74 |
+
</div>
|
| 75 |
+
);
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
export default App;
|
frontend/src/components/Aggregator.jsx
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useEffect } from 'react';
|
| 2 |
+
import axios from 'axios';
|
| 3 |
+
import { Calendar, RefreshCw } from 'lucide-react';
|
| 4 |
+
import ClusterCard from './ClusterCard';
|
| 5 |
+
import { cn } from '../lib/utils';
|
| 6 |
+
import { motion } from 'framer-motion';
|
| 7 |
+
|
| 8 |
+
export default function Aggregator() {
|
| 9 |
+
const [data, setData] = useState(null);
|
| 10 |
+
const [loading, setLoading] = useState(true);
|
| 11 |
+
const [error, setError] = useState(null);
|
| 12 |
+
const [selectedDayIndex, setSelectedDayIndex] = useState(0);
|
| 13 |
+
|
| 14 |
+
useEffect(() => {
|
| 15 |
+
fetchData();
|
| 16 |
+
}, []);
|
| 17 |
+
|
| 18 |
+
const fetchData = async () => {
|
| 19 |
+
try {
|
| 20 |
+
setLoading(true);
|
| 21 |
+
setError(null);
|
| 22 |
+
const response = await axios.get('/api/clusters');
|
| 23 |
+
|
| 24 |
+
// Sort daysdescending just in case or keep as valid from API
|
| 25 |
+
// API returns top 3 days. We'll use them as is.
|
| 26 |
+
setData(response.data);
|
| 27 |
+
} catch (err) {
|
| 28 |
+
console.error(err);
|
| 29 |
+
setError("Failed to load clusters. Please try again.");
|
| 30 |
+
} finally {
|
| 31 |
+
setLoading(false);
|
| 32 |
+
}
|
| 33 |
+
};
|
| 34 |
+
|
| 35 |
+
if (loading) {
|
| 36 |
+
return (
|
| 37 |
+
<div className="flex flex-col items-center justify-center py-20 text-slate-500">
|
| 38 |
+
<RefreshCw className="w-8 h-8 animate-spin mb-4 text-blue-500" />
|
| 39 |
+
<p>Loading intelligence stream...</p>
|
| 40 |
+
</div>
|
| 41 |
+
);
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
if (error) {
|
| 45 |
+
return (
|
| 46 |
+
<div className="text-center py-20 text-red-400 bg-red-900/10 rounded-xl border border-red-900/20">
|
| 47 |
+
<p className="mb-4">{error}</p>
|
| 48 |
+
<button
|
| 49 |
+
onClick={fetchData}
|
| 50 |
+
className="px-4 py-2 bg-red-600 text-white rounded-lg hover:bg-red-500 transition-colors"
|
| 51 |
+
>
|
| 52 |
+
Retry
|
| 53 |
+
</button>
|
| 54 |
+
</div>
|
| 55 |
+
);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
if (!data || data.length === 0) {
|
| 59 |
+
return <div className="text-center py-20 text-slate-500">No data available.</div>;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
const currentDay = data[selectedDayIndex];
|
| 63 |
+
|
| 64 |
+
return (
|
| 65 |
+
<div className="space-y-6">
|
| 66 |
+
{/* Day Tabs */}
|
| 67 |
+
<div className="flex justify-center gap-4">
|
| 68 |
+
{data.map((dayItem, index) => (
|
| 69 |
+
<button
|
| 70 |
+
key={dayItem.date}
|
| 71 |
+
onClick={() => setSelectedDayIndex(index)}
|
| 72 |
+
className={cn(
|
| 73 |
+
"flex flex-col items-center p-3 rounded-xl border transition-all duration-300 w-32",
|
| 74 |
+
selectedDayIndex === index
|
| 75 |
+
? "bg-slate-800 border-blue-500/50 shadow-lg shadow-blue-500/10 text-white"
|
| 76 |
+
: "bg-slate-900/50 border-slate-800 text-slate-400 hover:bg-slate-800 hover:border-slate-700"
|
| 77 |
+
)}
|
| 78 |
+
>
|
| 79 |
+
<span className="text-xs font-medium uppercase tracking-wider text-slate-500 mb-1">
|
| 80 |
+
{new Date(dayItem.date).toLocaleDateString(undefined, { weekday: 'short' })}
|
| 81 |
+
</span>
|
| 82 |
+
<span className="text-lg font-bold">
|
| 83 |
+
{new Date(dayItem.date).toLocaleDateString(undefined, { day: '2-digit', month: 'short' })}
|
| 84 |
+
</span>
|
| 85 |
+
</button>
|
| 86 |
+
))}
|
| 87 |
+
</div>
|
| 88 |
+
|
| 89 |
+
{/* Stats for the day */}
|
| 90 |
+
<div className="flex items-center justify-between px-4 pb-2 text-slate-400 text-sm border-b border-slate-800">
|
| 91 |
+
<div className="flex items-center gap-2">
|
| 92 |
+
<Calendar className="w-4 h-4" />
|
| 93 |
+
<span>Events for {new Date(currentDay.date).toLocaleDateString()}</span>
|
| 94 |
+
</div>
|
| 95 |
+
<div>
|
| 96 |
+
{currentDay.clusters.length} Event Clusters
|
| 97 |
+
</div>
|
| 98 |
+
</div>
|
| 99 |
+
|
| 100 |
+
{/* Clusters Grid */}
|
| 101 |
+
<div className="grid grid-cols-1 md:grid-cols-1 gap-4">
|
| 102 |
+
{currentDay.clusters.map((cluster) => (
|
| 103 |
+
<ClusterCard key={cluster.cluster_id} cluster={cluster} />
|
| 104 |
+
))}
|
| 105 |
+
</div>
|
| 106 |
+
</div>
|
| 107 |
+
);
|
| 108 |
+
}
|
frontend/src/components/Analyzer.jsx
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from 'react';
|
| 2 |
+
import axios from 'axios';
|
| 3 |
+
import { Search, AlertTriangle, CheckCircle, AlertOctagon, ArrowRight, Loader2 } from 'lucide-react';
|
| 4 |
+
import { motion } from 'framer-motion';
|
| 5 |
+
import { cn } from '../lib/utils';
|
| 6 |
+
|
| 7 |
+
export default function Analyzer() {
|
| 8 |
+
const [url, setUrl] = useState('');
|
| 9 |
+
const [result, setResult] = useState(null);
|
| 10 |
+
const [loading, setLoading] = useState(false);
|
| 11 |
+
const [error, setError] = useState(null);
|
| 12 |
+
|
| 13 |
+
const handleAnalyze = async (e) => {
|
| 14 |
+
e.preventDefault();
|
| 15 |
+
if (!url) return;
|
| 16 |
+
|
| 17 |
+
setLoading(true);
|
| 18 |
+
setError(null);
|
| 19 |
+
setResult(null);
|
| 20 |
+
|
| 21 |
+
try {
|
| 22 |
+
const response = await axios.post('/api/analyze', { url });
|
| 23 |
+
setResult(response.data);
|
| 24 |
+
} catch (err) {
|
| 25 |
+
console.error(err);
|
| 26 |
+
setError("Analysis failed. Please check the URL or try again later.");
|
| 27 |
+
} finally {
|
| 28 |
+
setLoading(false);
|
| 29 |
+
}
|
| 30 |
+
};
|
| 31 |
+
|
| 32 |
+
return (
|
| 33 |
+
<div className="max-w-3xl mx-auto space-y-8">
|
| 34 |
+
{/* Input Section */}
|
| 35 |
+
<div className="bg-slate-900/50 p-8 rounded-2xl border border-slate-800 shadow-xl backdrop-blur-sm">
|
| 36 |
+
<h2 className="text-2xl font-bold mb-6 text-center text-slate-100">Analyze News Article</h2>
|
| 37 |
+
<form onSubmit={handleAnalyze} className="flex gap-2">
|
| 38 |
+
<div className="relative flex-1">
|
| 39 |
+
<Search className="absolute left-4 top-1/2 -translate-y-1/2 text-slate-500 w-5 h-5" />
|
| 40 |
+
<input
|
| 41 |
+
type="url"
|
| 42 |
+
placeholder="Paste article URL here (e.g., https://elpais.com/...)"
|
| 43 |
+
value={url}
|
| 44 |
+
onChange={(e) => setUrl(e.target.value)}
|
| 45 |
+
className="w-full bg-slate-950 border border-slate-700 text-slate-100 pl-11 pr-4 py-4 rounded-xl focus:outline-none focus:ring-2 focus:ring-blue-500/50 transition-all placeholder:text-slate-600"
|
| 46 |
+
required
|
| 47 |
+
/>
|
| 48 |
+
</div>
|
| 49 |
+
<button
|
| 50 |
+
type="submit"
|
| 51 |
+
disabled={loading}
|
| 52 |
+
className="bg-blue-600 hover:bg-blue-500 text-white px-8 rounded-xl font-medium transition-colors disabled:opacity-50 disabled:cursor-not-allowed flex items-center gap-2"
|
| 53 |
+
>
|
| 54 |
+
{loading ? <Loader2 className="animate-spin w-5 h-5" /> : "Analyze"}
|
| 55 |
+
</button>
|
| 56 |
+
</form>
|
| 57 |
+
</div>
|
| 58 |
+
|
| 59 |
+
{/* Error Message */}
|
| 60 |
+
{error && (
|
| 61 |
+
<motion.div
|
| 62 |
+
initial={{ opacity: 0, y: 10 }}
|
| 63 |
+
animate={{ opacity: 1, y: 0 }}
|
| 64 |
+
className="p-4 bg-red-950/30 border border-red-900/50 rounded-xl text-red-400 text-center"
|
| 65 |
+
>
|
| 66 |
+
{error}
|
| 67 |
+
</motion.div>
|
| 68 |
+
)}
|
| 69 |
+
|
| 70 |
+
{/* Results Section */}
|
| 71 |
+
{result && (
|
| 72 |
+
<motion.div
|
| 73 |
+
initial={{ opacity: 0, scale: 0.95 }}
|
| 74 |
+
animate={{ opacity: 1, scale: 1 }}
|
| 75 |
+
className="bg-slate-900/50 rounded-2xl border border-slate-800 overflow-hidden shadow-2xl"
|
| 76 |
+
>
|
| 77 |
+
{/* Header */}
|
| 78 |
+
<div className="p-6 border-b border-slate-800 bg-slate-900/80">
|
| 79 |
+
<h3 className="text-xl font-semibold text-slate-100 leading-snug">{result.title}</h3>
|
| 80 |
+
<p className="mt-2 text-slate-400 text-sm line-clamp-3 leading-relaxed">
|
| 81 |
+
{result.text_snippet}
|
| 82 |
+
</p>
|
| 83 |
+
</div>
|
| 84 |
+
|
| 85 |
+
{/* Analysis Cards */}
|
| 86 |
+
<div className="grid grid-cols-1 md:grid-cols-2 gap-px bg-slate-800">
|
| 87 |
+
{/* Clickbait Result */}
|
| 88 |
+
<div className={cn(
|
| 89 |
+
"p-8 flex flex-col items-center justify-center text-center gap-4",
|
| 90 |
+
result.is_clickbait ? "bg-red-950/20" : "bg-emerald-950/20"
|
| 91 |
+
)}>
|
| 92 |
+
<div className={cn(
|
| 93 |
+
"w-16 h-16 rounded-full flex items-center justify-center mb-2 shadow-lg",
|
| 94 |
+
result.is_clickbait ? "bg-orange-500/20 text-orange-500" : "bg-emerald-500/20 text-emerald-500"
|
| 95 |
+
)}>
|
| 96 |
+
{result.is_clickbait ? <AlertTriangle className="w-8 h-8" /> : <CheckCircle className="w-8 h-8" />}
|
| 97 |
+
</div>
|
| 98 |
+
<div>
|
| 99 |
+
<h4 className="text-slate-400 text-sm font-medium uppercase tracking-widest mb-1">Headline Analysis</h4>
|
| 100 |
+
<p className={cn(
|
| 101 |
+
"text-2xl font-bold",
|
| 102 |
+
result.is_clickbait ? "text-orange-400" : "text-emerald-400"
|
| 103 |
+
)}>
|
| 104 |
+
{result.is_clickbait ? "Clickbait Detected" : "No Clickbait"}
|
| 105 |
+
</p>
|
| 106 |
+
</div>
|
| 107 |
+
<div className="text-xs text-slate-500 bg-slate-900/50 px-3 py-1 rounded-full">
|
| 108 |
+
Confidence: {(result.clickbait_conf * 100).toFixed(1)}%
|
| 109 |
+
</div>
|
| 110 |
+
</div>
|
| 111 |
+
|
| 112 |
+
{/* Sensationalism Result */}
|
| 113 |
+
<div className={cn(
|
| 114 |
+
"p-8 flex flex-col items-center justify-center text-center gap-4",
|
| 115 |
+
result.is_sensationalist ? "bg-red-950/20" : "bg-emerald-950/20"
|
| 116 |
+
)}>
|
| 117 |
+
<div className={cn(
|
| 118 |
+
"w-16 h-16 rounded-full flex items-center justify-center mb-2 shadow-lg",
|
| 119 |
+
result.is_sensationalist ? "bg-red-500/20 text-red-500" : "bg-blue-500/20 text-blue-500"
|
| 120 |
+
)}>
|
| 121 |
+
{result.is_sensationalist ? <AlertOctagon className="w-8 h-8" /> : <CheckCircle className="w-8 h-8" />}
|
| 122 |
+
</div>
|
| 123 |
+
<div>
|
| 124 |
+
<h4 className="text-slate-400 text-sm font-medium uppercase tracking-widest mb-1">Tone Analysis</h4>
|
| 125 |
+
<p className={cn(
|
| 126 |
+
"text-2xl font-bold",
|
| 127 |
+
result.is_sensationalist ? "text-red-400" : "text-blue-400"
|
| 128 |
+
)}>
|
| 129 |
+
{result.is_sensationalist ? "Sensationalist" : "Neutral Tone"}
|
| 130 |
+
</p>
|
| 131 |
+
</div>
|
| 132 |
+
<div className="text-xs text-slate-500 bg-slate-900/50 px-3 py-1 rounded-full">
|
| 133 |
+
Confidence: {(result.sensationalist_conf * 100).toFixed(1)}%
|
| 134 |
+
</div>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
</motion.div>
|
| 138 |
+
)}
|
| 139 |
+
</div>
|
| 140 |
+
);
|
| 141 |
+
}
|
frontend/src/components/ClusterCard.jsx
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from 'react';
|
| 2 |
+
import { motion, AnimatePresence } from 'framer-motion';
|
| 3 |
+
import { ChevronDown, ExternalLink, Hash, FileText } from 'lucide-react';
|
| 4 |
+
import { cn } from '../lib/utils';
|
| 5 |
+
|
| 6 |
+
export default function ClusterCard({ cluster }) {
|
| 7 |
+
const [isExpanded, setIsExpanded] = useState(false);
|
| 8 |
+
|
| 9 |
+
return (
|
| 10 |
+
<motion.div
|
| 11 |
+
layout
|
| 12 |
+
className="bg-slate-900/50 border border-slate-800 rounded-xl overflow-hidden hover:border-slate-700 transition-colors"
|
| 13 |
+
>
|
| 14 |
+
<div
|
| 15 |
+
onClick={() => setIsExpanded(!isExpanded)}
|
| 16 |
+
className="p-5 cursor-pointer flex items-start justify-between gap-4"
|
| 17 |
+
>
|
| 18 |
+
<div className="flex-1">
|
| 19 |
+
<div className="flex items-center gap-2 mb-2">
|
| 20 |
+
<span className="bg-blue-500/10 text-blue-400 text-xs px-2 py-0.5 rounded-full border border-blue-500/20 font-medium flex items-center gap-1">
|
| 21 |
+
<Hash className="w-3 h-3" />
|
| 22 |
+
Cluster {cluster.cluster_id}
|
| 23 |
+
</span>
|
| 24 |
+
<span className="bg-slate-800 text-slate-400 text-xs px-2 py-0.5 rounded-full font-medium flex items-center gap-1">
|
| 25 |
+
<FileText className="w-3 h-3" />
|
| 26 |
+
{cluster.size} articles
|
| 27 |
+
</span>
|
| 28 |
+
</div>
|
| 29 |
+
<h3 className="text-lg font-semibold text-slate-200 leading-tight group-hover:text-blue-400 transition-colors">
|
| 30 |
+
{cluster.title}
|
| 31 |
+
</h3>
|
| 32 |
+
</div>
|
| 33 |
+
<motion.div
|
| 34 |
+
animate={{ rotate: isExpanded ? 180 : 0 }}
|
| 35 |
+
transition={{ duration: 0.2 }}
|
| 36 |
+
className="bg-slate-800 p-1.5 rounded-lg text-slate-400"
|
| 37 |
+
>
|
| 38 |
+
<ChevronDown className="w-5 h-5" />
|
| 39 |
+
</motion.div>
|
| 40 |
+
</div>
|
| 41 |
+
|
| 42 |
+
<AnimatePresence>
|
| 43 |
+
{isExpanded && (
|
| 44 |
+
<motion.div
|
| 45 |
+
initial={{ height: 0, opacity: 0 }}
|
| 46 |
+
animate={{ height: "auto", opacity: 1 }}
|
| 47 |
+
exit={{ height: 0, opacity: 0 }}
|
| 48 |
+
className="border-t border-slate-800 bg-slate-900/30"
|
| 49 |
+
>
|
| 50 |
+
<div className="p-4 space-y-3">
|
| 51 |
+
{cluster.articles.map((article, idx) => (
|
| 52 |
+
<a
|
| 53 |
+
key={idx}
|
| 54 |
+
href={article.url}
|
| 55 |
+
target="_blank"
|
| 56 |
+
rel="noopener noreferrer"
|
| 57 |
+
className="block p-3 rounded-lg bg-slate-800/40 hover:bg-slate-800 transition-colors group"
|
| 58 |
+
>
|
| 59 |
+
<div className="flex justify-between items-start gap-3">
|
| 60 |
+
<h4 className="text-sm text-slate-300 font-medium line-clamp-2 group-hover:text-blue-300">
|
| 61 |
+
{article.title}
|
| 62 |
+
</h4>
|
| 63 |
+
<ExternalLink className="w-4 h-4 text-slate-500 shrink-0 group-hover:text-blue-400" />
|
| 64 |
+
</div>
|
| 65 |
+
<div className="mt-2 flex items-center gap-2 text-xs text-slate-500">
|
| 66 |
+
<span className="font-semibold text-slate-400">{article.newspaper}</span>
|
| 67 |
+
<span>•</span>
|
| 68 |
+
<span className="truncate max-w-[200px]">{article.newspaper_url}</span>
|
| 69 |
+
</div>
|
| 70 |
+
</a>
|
| 71 |
+
))}
|
| 72 |
+
</div>
|
| 73 |
+
</motion.div>
|
| 74 |
+
)}
|
| 75 |
+
</AnimatePresence>
|
| 76 |
+
</motion.div>
|
| 77 |
+
);
|
| 78 |
+
}
|
frontend/src/index.css
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@tailwind base;
|
| 2 |
+
@tailwind components;
|
| 3 |
+
@tailwind utilities;
|
| 4 |
+
|
| 5 |
+
@layer base {
|
| 6 |
+
body {
|
| 7 |
+
@apply bg-slate-950 text-slate-100;
|
| 8 |
+
}
|
| 9 |
+
}
|
frontend/src/lib/utils.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { clsx } from "clsx"
|
| 2 |
+
import { twMerge } from "tailwind-merge"
|
| 3 |
+
|
| 4 |
+
/**
 * Build a single className string from any number of class inputs.
 *
 * The inputs are first combined by `clsx`, and that string is then passed
 * through `twMerge`.
 *
 * @param {...*} inputs - Class values forwarded to `clsx` as one array.
 * @returns {string} The merged class string.
 */
export function cn(...inputs) {
  const combined = clsx(inputs);
  return twMerge(combined);
}
|
frontend/src/main.jsx
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import React from 'react'
|
| 2 |
+
import ReactDOM from 'react-dom/client'
|
| 3 |
+
import App from './App.jsx'
|
| 4 |
+
import './index.css'
|
| 5 |
+
|
| 6 |
+
ReactDOM.createRoot(document.getElementById('root')).render(
|
| 7 |
+
<React.StrictMode>
|
| 8 |
+
<App />
|
| 9 |
+
</React.StrictMode>,
|
| 10 |
+
)
|
frontend/tailwind.config.js
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Files Tailwind scans for class names.
const content = [
  "./index.html",
  "./src/**/*.{js,ts,jsx,tsx}",
];

// Named colours added on top of the default palette.
const colors = {
  primary: "#2563EB",
  secondary: "#475569",
  accent: "#F59E0B",
  background: "#0F172A",
  surface: "#1E293B",
};

/** @type {import('tailwindcss').Config} */
export default {
  content,
  theme: {
    extend: {
      fontFamily: {
        sans: ['Inter', 'sans-serif'],
      },
      colors,
    },
  },
  plugins: [],
};
|
frontend/vite.config.js
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { defineConfig } from 'vite'
|
| 2 |
+
import react from '@vitejs/plugin-react'
|
| 3 |
+
import path from 'path'
|
| 4 |
+
|
| 5 |
+
// https://vitejs.dev/config/

// `@` resolves to the project's src directory.
const srcDir = path.resolve(__dirname, './src');

// Dev server only: requests under /api are forwarded to localhost:8000.
const apiProxy = {
  target: 'http://localhost:8000',
  changeOrigin: true,
};

export default defineConfig({
  plugins: [react()],
  resolve: {
    alias: { '@': srcDir },
  },
  server: {
    proxy: { '/api': apiProxy },
  },
});
|
main.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from fastapi import FastAPI, HTTPException
|
| 4 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 5 |
+
from fastapi.staticfiles import StaticFiles
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
from typing import List, Dict, Any
|
| 8 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 9 |
+
import torch
|
| 10 |
+
import newspaper
|
| 11 |
+
from newspaper import Article
|
| 12 |
+
import logging
|
| 13 |
+
import nltk
|
| 14 |
+
|
| 15 |
+
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

# CORS configuration
# Every origin is allowed, so credentialed requests are switched off: a
# wildcard origin together with allow_credentials=True is not a valid CORS
# combination, and the bundled frontend calls the API without credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global variables
DATA_PATH = "src/dataset.parquet"
df = None  # set by load_data(); an empty DataFrame if loading failed
models = {}  # filled by load_models() with tokenizer/model objects
|
| 34 |
+
|
| 35 |
+
# Load Data
|
| 36 |
+
def load_data():
    """Read the parquet dataset at ``DATA_PATH`` into the module-level ``df``.

    The ``date`` column is converted to strings. Any failure is logged and
    ``df`` is set to an empty DataFrame instead of raising.
    """
    global df
    try:
        logger.info(f"Loading data from {DATA_PATH}...")
        frame = pd.read_parquet(DATA_PATH)
        # Ensure date is string or datetime for consistent handling
        frame['date'] = frame['date'].astype(str)
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        df = pd.DataFrame()
    else:
        df = frame
        logger.info("Data loaded successfully.")
|
| 47 |
+
|
| 48 |
+
# Load Models
|
| 49 |
+
def load_models():
    """Load the clickbait and sensationalism classifiers into ``models``.

    Stores four entries: ``cb_tokenizer``, ``cb_model``, ``sens_tokenizer``
    and ``sens_model``. Returns immediately when all four are already present.
    Failures are logged rather than raised, so entries loaded before the
    failure remain in ``models``.
    """
    # Check the keys this function actually stores. The previous guard looked
    # for a "clickbait" key that is never written, so it never skipped a reload.
    required_keys = ("cb_tokenizer", "cb_model", "sens_tokenizer", "sens_model")
    if all(key in models for key in required_keys):
        return

    try:
        logger.info("Loading Clickbait Model...")
        cb_model_name = "eriksarriegui/mmBERT-base-clickbait-detection-es"
        models["cb_tokenizer"] = AutoTokenizer.from_pretrained(cb_model_name)
        models["cb_model"] = AutoModelForSequenceClassification.from_pretrained(cb_model_name)

        logger.info("Loading Sensationalism Model...")
        sens_model_name = "eriksarriegui/mmBERT-base-sensacionalism-detection-es"
        models["sens_tokenizer"] = AutoTokenizer.from_pretrained(sens_model_name)
        models["sens_model"] = AutoModelForSequenceClassification.from_pretrained(sens_model_name)
        logger.info("Models loaded successfully.")
    except Exception as e:
        logger.error(f"Error loading models: {e}")
|
| 67 |
+
|
| 68 |
+
@app.on_event("startup")
async def startup_event():
    """Startup hook: fetch NLTK tokenizer data, then load the dataset and models."""
    try:
        # An exception on the first download skips the second, as both share this try block.
        for resource in ('punkt', 'punkt_tab'):
            nltk.download(resource)
    except Exception as exc:
        logger.warning(f"NLTK download warning: {exc}")

    load_data()
    # Models are loaded eagerly at startup rather than on first request.
    # Note: Downloading models takes time.
    load_models()
|
| 81 |
+
|
| 82 |
+
# Pydantic Models
|
| 83 |
+
class AnalyzeRequest(BaseModel):
    """Request body with a single ``url`` field.

    NOTE(review): presumably the payload of POST /api/analyze, which the
    frontend calls with ``{ url }`` — the endpoint is not visible here, confirm.
    """
    # Typed as a plain string, so no URL-specific type is enforced by this model.
    url: str
|
| 85 |
+
|
| 86 |
+
class ClusterResponse(BaseModel):
    """A date string together with a list of cluster dictionaries.

    NOTE(review): has the same two keys as each item built by get_clusters,
    but no visible endpoint declares it as a response_model — confirm usage.
    """
    date: str
    clusters: List[Dict[str, Any]]
|
| 89 |
+
|
| 90 |
+
# Endpoints
|
| 91 |
+
|
| 92 |
+
@app.get("/api/clusters")
def get_clusters():
    """Return the three dates with the most distinct clusters, with their clusters.

    Each item has a ``date`` and a ``clusters`` list. Each cluster has
    ``cluster_id`` (as a string), ``title`` (taken from the group's first row),
    ``size`` and ``articles``.

    Raises:
        HTTPException: status 500 when the dataset is missing or empty.
    """
    if df is None or df.empty:
        raise HTTPException(status_code=500, detail="Data not loaded")

    def build_article(row):
        # Defaults apply only when the column is absent from the row.
        return {
            "title": row.get('title', 'No Title'),
            "newspaper": row.get('newspaper', 'Unknown'),
            "url": row.get('article_url', '#'),
            "newspaper_url": row.get('newspaper_url', '#')
        }

    def build_cluster(cluster_id, group):
        # The first row of the group supplies the representative title.
        first_row = group.iloc[0]
        if 'cluster_size' in group:
            size = int(group['cluster_size'].iloc[0])
        else:
            size = len(group)
        return {
            "cluster_id": str(cluster_id),
            "title": first_row.get('title', 'Cluster Title'),
            "size": size,
            "articles": [build_article(row) for _, row in group.iterrows()]
        }

    # Distinct cluster_ids per date, then the three dates with the highest count.
    per_day = df.groupby('date')['cluster_id'].nunique().reset_index()
    per_day.columns = ['date', 'count']
    ranked = per_day.sort_values('count', ascending=False)
    top_days = ranked.head(3)['date'].tolist()

    return [
        {
            "date": day,
            "clusters": [
                build_cluster(cluster_id, group)
                for cluster_id, group in df[df['date'] == day].groupby('cluster_id')
            ]
        }
        for day in top_days
    ]
|
| 143 |
+
|
| 144 |
+
def _predict_positive(prefix, text):
    """Run the ``prefix`` classifier ("cb" or "sens") from ``models`` on ``text``.

    Returns:
        ``(is_positive, confidence)``: whether the arg-max label index is 1,
        and the softmax probability of label index 1.
    """
    inputs = models[f"{prefix}_tokenizer"](text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = models[f"{prefix}_model"](**inputs).logits
    probs = torch.softmax(logits, dim=1)
    # Assumes label index 1 is the positive class -- TODO confirm against the
    # id2label mapping in each model's config.
    return torch.argmax(probs).item() == 1, float(probs[0][1])


@app.post("/api/analyze")
def analyze_article(request: AnalyzeRequest):
    """Download the article at ``request.url`` and classify it.

    The clickbait classifier receives the title only. The sensationalism
    classifier receives the title and body joined in a
    "TITULAR: ...\\nCUERPO: ..." prompt.

    Returns:
        A dict with ``title``, ``text_snippet`` (at most 200 characters plus
        "..."), ``is_clickbait``, ``is_sensationalist``, ``clickbait_conf`` and
        ``sensationalist_conf``.

    Raises:
        HTTPException: 400 when the URL is not an absolute http(s) address,
            503 when the models are not loaded, 500 when scraping or
            inference fails.
    """
    from urllib.parse import urlparse

    url = request.url.strip()

    # The URL comes from the client and is fetched by this server, so reject
    # anything that is not plain http(s) before downloading.
    # NOTE(review): this does not block internal/private hosts; add an
    # allow-list if this service can reach sensitive networks.
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        raise HTTPException(status_code=400, detail="URL must be an absolute http(s) address")

    # load_models() only logs failures, so the registry may be incomplete.
    required_keys = ("cb_tokenizer", "cb_model", "sens_tokenizer", "sens_model")
    if any(key not in models for key in required_keys):
        raise HTTPException(status_code=503, detail="Models not loaded")

    try:
        # Scrape
        article = Article(url)
        article.download()
        article.parse()

        title = article.title
        text = article.text

        # Clickbait: title only.
        is_clickbait, clickbait_conf = _predict_positive("cb", title)

        # Sensationalism: title + body.
        sens_input_text = f"TITULAR: {title}\nCUERPO: {text}"
        is_sensationalist, sensationalist_conf = _predict_positive("sens", sens_input_text)

        return {
            "title": title,
            "text_snippet": text[:200] + "..." if len(text) > 200 else text,
            "is_clickbait": is_clickbait,
            "is_sensationalist": is_sensationalist,
            "clickbait_conf": clickbait_conf,
            "sensationalist_conf": sensationalist_conf
        }

    except Exception as e:
        logger.error(f"Analysis failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 190 |
+
|
| 191 |
+
# Static files for Frontend (will be populated after build)
# The Dockerfile copies the built frontend into /app/static. This mount is
# declared after the /api routes above.
app.mount("/", StaticFiles(directory="static", html=True), name="static")

# Direct-run entry point: serve the app with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
pandas
|
| 4 |
+
pyarrow
|
| 5 |
+
torch
|
| 6 |
+
transformers
|
| 7 |
+
newspaper3k
|
| 8 |
+
lxml[html_clean]
|
| 9 |
+
python-multipart
|
| 10 |
+
scipy
|
| 11 |
+
scikit-learn
|
| 12 |
+
emoji
|
src/dataset.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eda3135ba3230dfe801b6b4cd0ac7b127a95abee7be3dd4687192911c510c834
|
| 3 |
+
size 28200022
|