Spaces:
Sleeping
Sleeping
Commit ·
353b9f4
0
Parent(s):
Backend snapshot from 760fa78
Browse files- .env.example +10 -0
- .gitignore +13 -0
- .python-version +1 -0
- Dockerfile +23 -0
- README.md +13 -0
- app/__init__.py +0 -0
- app/constants.py +8 -0
- app/database.py +130 -0
- app/main.py +365 -0
- app/models.py +82 -0
- app/services/__init__.py +0 -0
- app/services/arxiv_bot.py +275 -0
- app/services/dify_client.py +341 -0
- app/services/llm_brain.py +105 -0
- app/services/pdf_renderer.py +81 -0
- backfill_thumbnails.py +48 -0
- main.py +65 -0
- paper_insight.db +0 -0
- pyproject.toml +19 -0
- uv.lock +0 -0
.env.example
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DATABASE_URL=postgresql://USER:PASSWORD@HOST:PORT/DATABASE?sslmode=require
|
| 2 |
+
|
| 3 |
+
# Dify Workflow API Configuration
|
| 4 |
+
DIFY_API_KEY=your_dify_api_key_here
|
| 5 |
+
DIFY_API_BASE=http://82.157.209.193:8080/v1
|
| 6 |
+
|
| 7 |
+
# Legacy DeepSeek Configuration (deprecated, use Dify instead)
|
| 8 |
+
# DEEPSEEK_API_KEY=your_api_key_here
|
| 9 |
+
# DEEPSEEK_BASE_URL=https://api.deepseek.com
|
| 10 |
+
# DEEPSEEK_MODEL=deepseek-chat
|
.gitignore
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python-generated files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[oc]
|
| 4 |
+
build/
|
| 5 |
+
dist/
|
| 6 |
+
wheels/
|
| 7 |
+
*.egg-info
|
| 8 |
+
|
| 9 |
+
# Virtual environments
|
| 10 |
+
.venv
|
| 11 |
+
|
| 12 |
+
# Environment variables
|
| 13 |
+
.env
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
Dockerfile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Match the interpreter pinned in .python-version (3.12); the previous
# python:3.10-slim base contradicted that pin.
FROM python:3.12-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    UV_CACHE_DIR=/tmp/uv-cache \
    PATH="/root/.local/bin:/root/.cargo/bin:$PATH"

# gcc + libpq-dev are needed to build the Postgres driver; curl fetches uv.
RUN apt-get update \
    && apt-get install -y --no-install-recommends gcc libpq-dev curl \
    && rm -rf /var/lib/apt/lists/*

RUN curl -LsSf https://astral.sh/uv/install.sh | sh

WORKDIR /app

# Copy only the dependency manifests first so the sync layer stays cached
# across source-only changes.
COPY pyproject.toml uv.lock ./
RUN uv sync --frozen --no-dev

COPY . .

EXPOSE 7860

CMD ["uv", "run", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Paper Insight API
|
| 3 |
+
emoji: "🎉"
|
| 4 |
+
colorFrom: "blue"
|
| 5 |
+
colorTo: "green"
|
| 6 |
+
sdk: "docker"
|
| 7 |
+
app_file: "Dockerfile"
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Paper Insight API
|
| 12 |
+
|
| 13 |
+
FastAPI backend for the Paper Insight project.
|
app/__init__.py
ADDED
|
File without changes
|
app/constants.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# arXiv category choices exposed to the frontend: category code, display
# name, and a short description of the subject area.
ARXIV_OPTIONS = [
    {"code": code, "name": name, "desc": desc}
    for code, name, desc in (
        ("cs.CV", "Computer Vision", "Image processing, generated models, segmentation"),
        ("cs.CL", "Computation and Language", "NLP, LLMs, Text mining"),
        ("cs.LG", "Machine Learning", "Deep learning architectures, optimization, algorithms"),
        ("cs.AI", "Artificial Intelligence", "General AI, reasoning, cognitive modeling"),
        ("cs.RO", "Robotics", "Kinematics, dynamics, sensors, control"),
        ("cs.SD", "Sound", "Audio processing, speech recognition"),
    )
]
|
app/database.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Generator
|
| 3 |
+
from sqlmodel import SQLModel, create_engine, Session
|
| 4 |
+
from sqlalchemy import inspect, text
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
# Load variables from a local .env file if present (no-op otherwise).
load_dotenv()

# Connection string; falls back to a local Postgres instance for development.
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql://postgres:postgres@localhost:5432/paper_insight",
)

# Single shared engine for the whole application.
engine = create_engine(DATABASE_URL, echo=False)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def create_db_and_tables():
    """Create every table registered on SQLModel's metadata (idempotent)."""
    SQLModel.metadata.create_all(engine)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def ensure_appsettings_schema():
    """Ensure the ``appsettings`` table matches the current AppSettings model.

    Legacy databases may predate some columns; add any that are missing and
    backfill NULL values with sensible defaults. Safe to run on every startup:
    existing columns and non-NULL values are never touched.
    """
    inspector = inspect(engine)
    if "appsettings" not in inspector.get_table_names():
        # Fresh database: create_db_and_tables() builds the full table.
        return

    # column name -> (SQL type for ALTER TABLE, SQL literal used to backfill NULLs)
    expected = {
        "research_focus": ("TEXT", "''"),
        "focus_keywords": ("JSON", "'[]'"),
        "system_prompt": ("TEXT", "''"),
        "arxiv_categories": ("JSON", "'[\"cs.CV\",\"cs.LG\"]'"),
    }

    columns = {col["name"] for col in inspector.get_columns("appsettings")}
    missing = [name for name in expected if name not in columns]

    # Nothing to add and no columns reported at all: leave the table alone
    # (mirrors the original guard for a degenerate/unusable table).
    if not missing and not columns:
        return

    final_columns = columns | set(missing)
    with engine.begin() as conn:
        for name in missing:
            col_type = expected[name][0]
            conn.execute(
                text(f"ALTER TABLE appsettings ADD COLUMN {name} {col_type}")
            )

        # Backfill NULLs so application code can assume non-NULL values.
        for name, (_, null_default) in expected.items():
            if name in final_columns:
                conn.execute(
                    text(
                        f"UPDATE appsettings SET {name} = {null_default} "
                        f"WHERE {name} IS NULL"
                    )
                )
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def ensure_paper_schema():
    """Ensure the paper table has a ``processing_status`` column.

    Handles legacy databases where the table may be named either ``paper``
    or ``papers`` and predates status tracking.
    """
    inspector = inspect(engine)
    tables = inspector.get_table_names()
    table = next((t for t in ("paper", "papers") if t in tables), None)
    if table is None:
        return

    existing = {col["name"] for col in inspector.get_columns(table)}

    with engine.begin() as conn:
        if "processing_status" not in existing:
            conn.execute(
                text(f"ALTER TABLE {table} ADD COLUMN processing_status TEXT")
            )

        # Seed the status from the legacy is_processed flag for rows that
        # have never had one.
        conn.execute(
            text(
                f"UPDATE {table} "
                "SET processing_status = CASE "
                "WHEN is_processed THEN 'processed' ELSE 'pending' END "
                "WHERE processing_status IS NULL"
            )
        )
        # Demote low-relevance processed rows to 'skipped' so they stay
        # hidden from default listings.
        conn.execute(
            text(
                f"UPDATE {table} "
                "SET processing_status = 'skipped' "
                "WHERE is_processed = TRUE "
                "AND relevance_score IS NOT NULL "
                "AND relevance_score < 5 "
                "AND processing_status = 'processed'"
            )
        )
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def get_session() -> Generator[Session, None, None]:
    """FastAPI dependency: yield a session that is closed after the request."""
    with Session(engine) as session:
        yield session
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def get_sync_session() -> Session:
    """Open a session for non-FastAPI contexts (background bots, scripts).

    The caller is responsible for closing it.
    """
    return Session(engine)
|
app/main.py
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from contextlib import asynccontextmanager
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
import re
|
| 4 |
+
import json
|
| 5 |
+
from fastapi import FastAPI, Depends, HTTPException, Query, BackgroundTasks
|
| 6 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
+
from fastapi.staticfiles import StaticFiles
|
| 8 |
+
from fastapi.responses import StreamingResponse
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from sqlmodel import Session, select
|
| 11 |
+
from sqlalchemy import or_
|
| 12 |
+
|
| 13 |
+
from app.database import create_db_and_tables, ensure_appsettings_schema, ensure_paper_schema, get_session
|
| 14 |
+
from app.models import Paper, PaperRead, AppSettings
|
| 15 |
+
from app.services.arxiv_bot import get_arxiv_bot, run_daily_fetch
|
| 16 |
+
from app.services.dify_client import (
|
| 17 |
+
get_dify_client,
|
| 18 |
+
DifyClientError,
|
| 19 |
+
DifyEntityTooLargeError,
|
| 20 |
+
DifyTimeoutError,
|
| 21 |
+
DifyRateLimitError,
|
| 22 |
+
)
|
| 23 |
+
from app.constants import ARXIV_OPTIONS
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup hook: create tables and migrate legacy schemas before serving."""
    create_db_and_tables()
    ensure_appsettings_schema()
    ensure_paper_schema()
    yield  # no teardown work needed
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# Application instance; the lifespan handler runs schema setup on startup.
app = FastAPI(
    title="Paper Insight API",
    description="API for fetching and summarizing arXiv papers focused on Autoregressive DiT and KV Cache Compression",
    version="0.1.0",
    lifespan=lifespan,
)

# Wide-open CORS: the frontend is served from a different origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve generated assets (e.g. thumbnails) from app/static, creating the
# directory on first run.
static_path = Path(__file__).parent / "static"
static_path.mkdir(exist_ok=True)
app.mount("/static", StaticFiles(directory=static_path), name="static")
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@app.get("/health")
|
| 56 |
+
def health_check():
|
| 57 |
+
"""Health check endpoint."""
|
| 58 |
+
return {"status": "healthy"}
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@app.get("/constants")
|
| 62 |
+
def get_constants():
|
| 63 |
+
"""Get application constants."""
|
| 64 |
+
return {"arxiv_options": ARXIV_OPTIONS}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@app.get("/settings", response_model=AppSettings)
|
| 68 |
+
def get_settings(session: Session = Depends(get_session)):
|
| 69 |
+
"""Get application settings."""
|
| 70 |
+
settings = session.get(AppSettings, 1)
|
| 71 |
+
if not settings:
|
| 72 |
+
settings = AppSettings(id=1)
|
| 73 |
+
session.add(settings)
|
| 74 |
+
session.commit()
|
| 75 |
+
session.refresh(settings)
|
| 76 |
+
return settings
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@app.put("/settings", response_model=AppSettings)
|
| 80 |
+
def update_settings(new_settings: AppSettings, session: Session = Depends(get_session)):
|
| 81 |
+
"""Update application settings."""
|
| 82 |
+
settings = session.get(AppSettings, 1)
|
| 83 |
+
if not settings:
|
| 84 |
+
settings = AppSettings(id=1)
|
| 85 |
+
session.add(settings)
|
| 86 |
+
|
| 87 |
+
settings.research_focus = new_settings.research_focus
|
| 88 |
+
settings.system_prompt = new_settings.system_prompt
|
| 89 |
+
settings.arxiv_categories = new_settings.arxiv_categories
|
| 90 |
+
|
| 91 |
+
# Parse focus keywords
|
| 92 |
+
if new_settings.research_focus:
|
| 93 |
+
raw_focus = new_settings.research_focus.strip()
|
| 94 |
+
if ";" in raw_focus:
|
| 95 |
+
keywords = [
|
| 96 |
+
k.strip() for k in re.split(r"[;]+", raw_focus)
|
| 97 |
+
if k.strip()
|
| 98 |
+
]
|
| 99 |
+
else:
|
| 100 |
+
parts = re.split(r"\bOR\b|\bAND\b", raw_focus, flags=re.IGNORECASE)
|
| 101 |
+
keywords = []
|
| 102 |
+
for part in parts:
|
| 103 |
+
cleaned = part.strip()
|
| 104 |
+
if not cleaned:
|
| 105 |
+
continue
|
| 106 |
+
cleaned = re.sub(r"^[()]+|[()]+$", "", cleaned).strip()
|
| 107 |
+
cleaned = re.sub(r"^(?:all|abs|ti):", "", cleaned, flags=re.IGNORECASE).strip()
|
| 108 |
+
cleaned = cleaned.strip('"').strip()
|
| 109 |
+
if cleaned:
|
| 110 |
+
keywords.append(cleaned)
|
| 111 |
+
|
| 112 |
+
seen = set()
|
| 113 |
+
deduped = []
|
| 114 |
+
for keyword in keywords:
|
| 115 |
+
if keyword not in seen:
|
| 116 |
+
deduped.append(keyword)
|
| 117 |
+
seen.add(keyword)
|
| 118 |
+
keywords = deduped
|
| 119 |
+
|
| 120 |
+
settings.focus_keywords = keywords
|
| 121 |
+
else:
|
| 122 |
+
settings.focus_keywords = []
|
| 123 |
+
|
| 124 |
+
session.add(settings)
|
| 125 |
+
session.commit()
|
| 126 |
+
session.refresh(settings)
|
| 127 |
+
return settings
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
@app.get("/papers", response_model=List[PaperRead])
|
| 131 |
+
def get_papers(
|
| 132 |
+
session: Session = Depends(get_session),
|
| 133 |
+
skip: int = Query(0, ge=0),
|
| 134 |
+
limit: int = Query(20, ge=1, le=100),
|
| 135 |
+
min_score: Optional[float] = Query(None, ge=0, le=10),
|
| 136 |
+
processed_only: bool = Query(False),
|
| 137 |
+
):
|
| 138 |
+
"""Get papers with optional filtering."""
|
| 139 |
+
query = select(Paper).where(
|
| 140 |
+
or_(Paper.processing_status.is_(None), Paper.processing_status != "skipped")
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
if processed_only:
|
| 144 |
+
query = query.where(Paper.is_processed == True)
|
| 145 |
+
|
| 146 |
+
if min_score is not None:
|
| 147 |
+
query = query.where(Paper.relevance_score >= min_score)
|
| 148 |
+
|
| 149 |
+
query = query.order_by(
|
| 150 |
+
Paper.is_processed.desc(),
|
| 151 |
+
Paper.relevance_score.desc().nulls_last(),
|
| 152 |
+
Paper.published.desc()
|
| 153 |
+
).offset(skip).limit(limit)
|
| 154 |
+
papers = session.exec(query).all()
|
| 155 |
+
return papers
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
@app.get("/papers/{paper_id}", response_model=PaperRead)
|
| 159 |
+
def get_paper(paper_id: int, session: Session = Depends(get_session)):
|
| 160 |
+
"""Get a specific paper by ID."""
|
| 161 |
+
paper = session.get(Paper, paper_id)
|
| 162 |
+
if not paper:
|
| 163 |
+
raise HTTPException(status_code=404, detail="Paper not found")
|
| 164 |
+
return paper
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
@app.get("/papers/arxiv/{arxiv_id}", response_model=PaperRead)
|
| 168 |
+
def get_paper_by_arxiv_id(arxiv_id: str, session: Session = Depends(get_session)):
|
| 169 |
+
"""Get a specific paper by arXiv ID."""
|
| 170 |
+
paper = session.exec(select(Paper).where(Paper.arxiv_id == arxiv_id)).first()
|
| 171 |
+
if not paper:
|
| 172 |
+
raise HTTPException(status_code=404, detail="Paper not found")
|
| 173 |
+
return paper
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
@app.post("/papers/fetch")
|
| 177 |
+
def fetch_papers(
|
| 178 |
+
background_tasks: BackgroundTasks,
|
| 179 |
+
session: Session = Depends(get_session),
|
| 180 |
+
):
|
| 181 |
+
"""Trigger paper fetching in the background."""
|
| 182 |
+
background_tasks.add_task(run_daily_fetch)
|
| 183 |
+
return {"message": "Paper fetch started in background"}
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
@app.post("/papers/{paper_id}/process")
|
| 187 |
+
async def process_paper(paper_id: int, session: Session = Depends(get_session)):
|
| 188 |
+
"""Process a specific paper with LLM analysis."""
|
| 189 |
+
paper = session.get(Paper, paper_id)
|
| 190 |
+
if not paper:
|
| 191 |
+
raise HTTPException(status_code=404, detail="Paper not found")
|
| 192 |
+
|
| 193 |
+
if paper.is_processed:
|
| 194 |
+
return {"message": "Paper already processed", "paper_id": paper_id}
|
| 195 |
+
|
| 196 |
+
bot = get_arxiv_bot()
|
| 197 |
+
success = await bot.process_paper(session, paper)
|
| 198 |
+
|
| 199 |
+
if success:
|
| 200 |
+
return {"message": "Paper processed successfully", "paper_id": paper_id}
|
| 201 |
+
else:
|
| 202 |
+
raise HTTPException(status_code=500, detail="Failed to process paper")
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
@app.get("/papers/{paper_id}/process/stream")
|
| 206 |
+
async def process_paper_stream(paper_id: int, session: Session = Depends(get_session)):
|
| 207 |
+
"""
|
| 208 |
+
Process a paper with streaming response for real-time updates.
|
| 209 |
+
|
| 210 |
+
Returns Server-Sent Events (SSE) with the following event types:
|
| 211 |
+
- thinking: R1 reasoning process (thought field)
|
| 212 |
+
- answer: Partial answer content
|
| 213 |
+
- progress: Processing progress updates
|
| 214 |
+
- result: Final structured analysis result
|
| 215 |
+
- error: Error information
|
| 216 |
+
- done: Stream completion signal
|
| 217 |
+
"""
|
| 218 |
+
paper = session.get(Paper, paper_id)
|
| 219 |
+
if not paper:
|
| 220 |
+
raise HTTPException(status_code=404, detail="Paper not found")
|
| 221 |
+
|
| 222 |
+
async def generate_events():
|
| 223 |
+
"""Generate SSE events for paper analysis."""
|
| 224 |
+
try:
|
| 225 |
+
# Update paper status
|
| 226 |
+
paper.processing_status = "processing"
|
| 227 |
+
session.add(paper)
|
| 228 |
+
session.commit()
|
| 229 |
+
|
| 230 |
+
# Send initial progress event
|
| 231 |
+
yield f"event: progress\ndata: {json.dumps({'status': 'started', 'message': '开始分析论文...'})}\n\n"
|
| 232 |
+
|
| 233 |
+
dify_client = get_dify_client()
|
| 234 |
+
thought_parts = []
|
| 235 |
+
answer_parts = []
|
| 236 |
+
final_outputs = None
|
| 237 |
+
|
| 238 |
+
async for event in dify_client.analyze_paper_stream(
|
| 239 |
+
paper.title,
|
| 240 |
+
paper.abstract,
|
| 241 |
+
user_id=f"paper-{paper_id}",
|
| 242 |
+
):
|
| 243 |
+
# Handle thought (R1 thinking process)
|
| 244 |
+
if event.thought:
|
| 245 |
+
thought_parts.append(event.thought)
|
| 246 |
+
yield f"event: thinking\ndata: {json.dumps({'thought': event.thought})}\n\n"
|
| 247 |
+
|
| 248 |
+
# Handle answer chunks
|
| 249 |
+
if event.answer:
|
| 250 |
+
answer_parts.append(event.answer)
|
| 251 |
+
yield f"event: answer\ndata: {json.dumps({'answer': event.answer})}\n\n"
|
| 252 |
+
|
| 253 |
+
# Handle workflow events
|
| 254 |
+
if event.event == "workflow_started":
|
| 255 |
+
yield f"event: progress\ndata: {json.dumps({'status': 'workflow_started', 'message': 'Dify工作流已启动'})}\n\n"
|
| 256 |
+
elif event.event == "node_started":
|
| 257 |
+
node_title = event.data.get("data", {}).get("title", "")
|
| 258 |
+
if node_title:
|
| 259 |
+
yield f"event: progress\ndata: {json.dumps({'status': 'node_started', 'message': f'执行节点: {node_title}'})}\n\n"
|
| 260 |
+
elif event.event == "workflow_finished":
|
| 261 |
+
if event.outputs:
|
| 262 |
+
final_outputs = event.outputs
|
| 263 |
+
|
| 264 |
+
# Process final result
|
| 265 |
+
if final_outputs:
|
| 266 |
+
result = dify_client._parse_outputs(final_outputs, "".join(thought_parts))
|
| 267 |
+
elif answer_parts:
|
| 268 |
+
result = dify_client._parse_answer("".join(answer_parts), "".join(thought_parts))
|
| 269 |
+
else:
|
| 270 |
+
raise DifyClientError("No output received from Dify workflow")
|
| 271 |
+
|
| 272 |
+
# Convert to LLMAnalysis for database storage
|
| 273 |
+
analysis = dify_client.to_llm_analysis(result)
|
| 274 |
+
|
| 275 |
+
# Update paper with results
|
| 276 |
+
from datetime import datetime
|
| 277 |
+
paper.summary_zh = analysis.summary_zh
|
| 278 |
+
paper.relevance_score = analysis.relevance_score
|
| 279 |
+
paper.relevance_reason = analysis.relevance_reason
|
| 280 |
+
paper.heuristic_idea = analysis.heuristic_idea
|
| 281 |
+
paper.is_processed = True
|
| 282 |
+
paper.processed_at = datetime.utcnow()
|
| 283 |
+
|
| 284 |
+
if analysis.relevance_score >= 5:
|
| 285 |
+
paper.processing_status = "processed"
|
| 286 |
+
else:
|
| 287 |
+
paper.processing_status = "skipped"
|
| 288 |
+
|
| 289 |
+
session.add(paper)
|
| 290 |
+
session.commit()
|
| 291 |
+
|
| 292 |
+
# Send final result
|
| 293 |
+
result_data = {
|
| 294 |
+
"summary_zh": result.summary_zh,
|
| 295 |
+
"relevance_score": result.relevance_score,
|
| 296 |
+
"relevance_reason": result.relevance_reason,
|
| 297 |
+
"technical_mapping": {
|
| 298 |
+
"token_vs_patch": result.technical_mapping.token_vs_patch,
|
| 299 |
+
"temporal_logic": result.technical_mapping.temporal_logic,
|
| 300 |
+
"frequency_domain": result.technical_mapping.frequency_domain,
|
| 301 |
+
},
|
| 302 |
+
"heuristic_idea": result.heuristic_idea,
|
| 303 |
+
"thought_process": result.thought_process,
|
| 304 |
+
}
|
| 305 |
+
yield f"event: result\ndata: {json.dumps(result_data, ensure_ascii=False)}\n\n"
|
| 306 |
+
yield f"event: done\ndata: {json.dumps({'status': 'completed'})}\n\n"
|
| 307 |
+
|
| 308 |
+
except DifyEntityTooLargeError as e:
|
| 309 |
+
paper.processing_status = "failed"
|
| 310 |
+
session.add(paper)
|
| 311 |
+
session.commit()
|
| 312 |
+
yield f"event: error\ndata: {json.dumps({'error': 'entity_too_large', 'message': str(e)})}\n\n"
|
| 313 |
+
|
| 314 |
+
except DifyTimeoutError as e:
|
| 315 |
+
paper.processing_status = "failed"
|
| 316 |
+
session.add(paper)
|
| 317 |
+
session.commit()
|
| 318 |
+
yield f"event: error\ndata: {json.dumps({'error': 'timeout', 'message': str(e)})}\n\n"
|
| 319 |
+
|
| 320 |
+
except DifyRateLimitError as e:
|
| 321 |
+
paper.processing_status = "failed"
|
| 322 |
+
session.add(paper)
|
| 323 |
+
session.commit()
|
| 324 |
+
yield f"event: error\ndata: {json.dumps({'error': 'rate_limit', 'message': str(e)})}\n\n"
|
| 325 |
+
|
| 326 |
+
except DifyClientError as e:
|
| 327 |
+
paper.processing_status = "failed"
|
| 328 |
+
session.add(paper)
|
| 329 |
+
session.commit()
|
| 330 |
+
yield f"event: error\ndata: {json.dumps({'error': 'dify_error', 'message': str(e)})}\n\n"
|
| 331 |
+
|
| 332 |
+
except Exception as e:
|
| 333 |
+
paper.processing_status = "failed"
|
| 334 |
+
session.add(paper)
|
| 335 |
+
session.commit()
|
| 336 |
+
yield f"event: error\ndata: {json.dumps({'error': 'unknown', 'message': str(e)})}\n\n"
|
| 337 |
+
|
| 338 |
+
return StreamingResponse(
|
| 339 |
+
generate_events(),
|
| 340 |
+
media_type="text/event-stream",
|
| 341 |
+
headers={
|
| 342 |
+
"Cache-Control": "no-cache",
|
| 343 |
+
"Connection": "keep-alive",
|
| 344 |
+
"X-Accel-Buffering": "no", # Disable nginx buffering
|
| 345 |
+
},
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
@app.get("/stats")
|
| 350 |
+
def get_stats(session: Session = Depends(get_session)):
|
| 351 |
+
"""Get statistics about papers."""
|
| 352 |
+
total = session.exec(
|
| 353 |
+
select(Paper).where(
|
| 354 |
+
or_(Paper.processing_status.is_(None), Paper.processing_status != "skipped")
|
| 355 |
+
)
|
| 356 |
+
).all()
|
| 357 |
+
processed = [p for p in total if p.is_processed]
|
| 358 |
+
high_relevance = [p for p in processed if p.relevance_score and p.relevance_score >= 9]
|
| 359 |
+
|
| 360 |
+
return {
|
| 361 |
+
"total_papers": len(total),
|
| 362 |
+
"processed_papers": len(processed),
|
| 363 |
+
"high_relevance_papers": len(high_relevance),
|
| 364 |
+
"pending_processing": len(total) - len(processed),
|
| 365 |
+
}
|
app/models.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
from typing import Optional, List
|
| 3 |
+
from sqlmodel import SQLModel, Field, Column, Text
|
| 4 |
+
from sqlalchemy import JSON
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Paper(SQLModel, table=True):
    """Paper model for storing arXiv papers with AI analysis."""
    id: Optional[int] = Field(default=None, primary_key=True)
    arxiv_id: str = Field(unique=True, index=True)  # natural key, unique per paper
    title: str
    authors: str  # stored as a single display string, not a relation
    abstract: str = Field(sa_column=Column(Text))
    categories: str  # arXiv category codes, e.g. "cs.CV"
    published: datetime
    updated: datetime
    pdf_url: str
    thumbnail_url: Optional[str] = None  # first-page preview; may be absent

    # AI-generated analysis fields (populated once the paper is processed)
    summary_zh: Optional[str] = Field(default=None, sa_column=Column(Text))  # Chinese summary
    relevance_score: Optional[float] = Field(default=None, ge=0, le=10)  # 0-10 scale
    relevance_reason: Optional[str] = Field(default=None, sa_column=Column(Text))
    heuristic_idea: Optional[str] = Field(default=None, sa_column=Column(Text))

    # Metadata
    is_processed: bool = Field(default=False)
    # Observed values in this codebase: "pending", "processing", "processed",
    # "skipped", "failed".
    processing_status: str = Field(default="pending", index=True)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    processed_at: Optional[datetime] = None
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class PaperCreate(SQLModel):
    """Schema for creating a new paper (raw arXiv metadata, no analysis)."""
    arxiv_id: str
    title: str
    authors: str
    abstract: str
    categories: str
    published: datetime
    updated: datetime
    pdf_url: str
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class PaperRead(SQLModel):
    """Schema for reading paper data (full Paper row, id guaranteed)."""
    id: int
    arxiv_id: str
    title: str
    authors: str
    abstract: str
    categories: str
    published: datetime
    updated: datetime
    pdf_url: str
    thumbnail_url: Optional[str] = None
    # Analysis fields are None until the paper has been processed.
    summary_zh: Optional[str]
    relevance_score: Optional[float]
    relevance_reason: Optional[str]
    heuristic_idea: Optional[str]
    is_processed: bool
    processing_status: str
    created_at: datetime
    processed_at: Optional[datetime]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class LLMAnalysis(BaseModel):
    """Schema for LLM analysis response (the fields persisted onto Paper)."""
    summary_zh: str  # Chinese-language summary
    relevance_score: float  # 0-10 relevance rating
    relevance_reason: str
    heuristic_idea: str
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class AppSettings(SQLModel, table=True):
    """Application settings stored in DB (singleton row with id=1)."""
    id: int = Field(default=1, primary_key=True)
    research_focus: str = Field(sa_column=Column(Text, default=""))
    # default_factory avoids the shared-mutable-default bug: the previous
    # default=[] handed the *same* list object to every new instance.
    focus_keywords: List[str] = Field(default_factory=list, sa_column=Column(JSON))
    system_prompt: str = Field(sa_column=Column(Text, default=""))
    # Callable column default so each INSERT gets a fresh list instead of all
    # rows sharing one mutable literal.
    arxiv_categories: List[str] = Field(
        sa_column=Column(JSON, default=lambda: ["cs.CV", "cs.LG"])
    )
|
app/services/__init__.py
ADDED
|
File without changes
|
app/services/arxiv_bot.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
import arxiv
|
| 4 |
+
from datetime import datetime, timedelta, timezone
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
from sqlmodel import Session, select
|
| 7 |
+
|
| 8 |
+
from app.models import Paper, PaperCreate, AppSettings
|
| 9 |
+
from app.database import get_sync_session
|
| 10 |
+
from app.services.pdf_renderer import generate_thumbnail
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _get_analysis_client():
    """Select the configured analysis backend.

    Returns a ``(client, backend_name)`` tuple: the Dify client under the
    name ``"dify"`` when ``DIFY_API_KEY`` is present in the environment,
    otherwise the legacy DeepSeek brain under ``"deepseek"``. Imports are
    deferred to call time so the unused backend's dependencies never load.
    """
    use_dify = bool(os.getenv("DIFY_API_KEY"))
    if use_dify:
        from app.services.dify_client import get_dify_client
        return get_dify_client(), "dify"
    from app.services.llm_brain import get_llm_brain
    return get_llm_brain(), "deepseek"
|
| 24 |
+
|
| 25 |
+
class ArxivBot:
    """Bot for fetching and processing arXiv papers.

    Responsibilities:
    - build a targeted arXiv search query from ``AppSettings`` (or defaults),
    - fetch recent papers and persist new ones,
    - run LLM analysis + thumbnail generation per paper.
    """

    def __init__(self):
        # Shared arXiv API client: 50 results per page, a polite 3-second
        # delay between requests, and retries for transient failures.
        self.client = arxiv.Client(
            page_size=50,
            delay_seconds=3,
            num_retries=3
        )

    def build_query(self, session: Session) -> str:
        """Builds a targeted arXiv query using AppSettings or defaults.

        The query is ``(category filter) AND (focus filter)``. The focus
        filter comes from ``settings.focus_keywords`` (preferred), then from
        the raw ``settings.research_focus`` string, then from a hard-coded
        default targeting DiT/KV-cache efficiency papers.
        """
        settings = session.get(AppSettings, 1)

        # Defaults
        default_categories = ['cs.CV', 'cs.LG', 'cs.CL']
        default_focus = (
            '((ti:transformer OR abs:transformer OR ti:diffusion OR abs:diffusion OR ti:DiT OR abs:DiT) AND '
            '(ti:"kv cache" OR abs:"kv cache" OR ti:compression OR abs:compression OR ti:pruning OR abs:pruning OR '
            'ti:quantization OR abs:quantization OR ti:sparse OR abs:sparse OR ti:"token merging" OR abs:"token merging" OR '
            'ti:distillation OR abs:distillation OR ti:efficiency OR abs:efficiency))'
        )
        categories = default_categories
        focus_query = default_focus

        if settings:
            if settings.arxiv_categories:
                categories = settings.arxiv_categories

            # Use focus_keywords if available, otherwise fall back to the
            # research_focus string or the default.
            if settings.focus_keywords and (not settings.research_focus or ";" in settings.research_focus):
                # Construct OR logic for keywords: (all:k1) OR (all:"k 2")
                keywords_parts = []
                for k in settings.focus_keywords:
                    # Wrap in quotes if it contains spaces and isn't already quoted
                    if " " in k and not (k.startswith('"') and k.endswith('"')):
                        term = f'"{k}"'
                    else:
                        term = k
                    keywords_parts.append(f'(all:{term})')

                if keywords_parts:
                    focus_query = f"({' OR '.join(keywords_parts)})"

            elif settings.research_focus and settings.research_focus.strip():
                # Fallback to the raw string if keywords list is empty but
                # string exists (backward compatibility).
                focus_query = f"({settings.research_focus})"

        # 1. Categories
        cat_query = "(" + " OR ".join([f"cat:{c}" for c in categories]) + ")"

        # 2. Combine
        final_query = f"{cat_query} AND {focus_query}"

        return final_query

    def fetch_recent_papers(
        self,
        session: Session,
        max_results: int = 50,
        hours_back: int = 168,  # 7 days to catch weekly arXiv updates
    ) -> List[PaperCreate]:
        """Fetch recent targeted papers from arXiv.

        Results are sorted newest-first; iteration stops at the first paper
        older than the cutoff. Timestamps are stored as naive UTC for the DB.
        """
        query = self.build_query(session)
        print(f"Executing Arxiv Query: {query}")

        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending,
        )

        # Use UTC-aware time for comparison (arXiv dates are aware UTC).
        cutoff_date = datetime.now(timezone.utc) - timedelta(hours=hours_back)
        papers = []

        for result in self.client.results(search):
            if result.published < cutoff_date:
                # Since results are sorted descending, we can stop early.
                break

            paper = PaperCreate(
                arxiv_id=result.entry_id.split("/")[-1],
                title=result.title.replace("\n", " ").strip(),
                authors=", ".join([author.name for author in result.authors]),
                abstract=result.summary.replace("\n", " ").strip(),
                categories=", ".join(result.categories),
                # Convert to naive UTC for DB storage
                published=result.published.astimezone(timezone.utc).replace(tzinfo=None),
                updated=result.updated.astimezone(timezone.utc).replace(tzinfo=None),
                pdf_url=result.pdf_url,
            )
            papers.append(paper)

        return papers

    def save_paper(self, session: Session, paper_data: PaperCreate) -> Optional[Paper]:
        """Save a paper to database if not exists.

        Returns the newly persisted ``Paper``, or ``None`` when a row with
        the same ``arxiv_id`` already exists (de-duplication).
        """
        existing = session.exec(
            select(Paper).where(Paper.arxiv_id == paper_data.arxiv_id)
        ).first()

        if existing:
            return None

        paper = Paper(**paper_data.model_dump())
        session.add(paper)
        session.commit()
        session.refresh(paper)
        return paper

    async def process_paper(self, session: Session, paper: Paper) -> bool:
        """Process a paper with LLM analysis and thumbnail generation.

        Returns True when the paper was analyzed and persisted; False when
        it was already processed or any step failed (status set to "failed").
        """
        if paper.is_processed:
            return False

        try:
            paper.processing_status = "processing"
            session.add(paper)
            session.commit()
            session.refresh(paper)

            # Get the appropriate analysis client (Dify preferred).
            client, client_type = _get_analysis_client()
            settings = session.get(AppSettings, 1)
            system_prompt_override = settings.system_prompt if settings else None

            # Execute analysis based on client type.
            loop = asyncio.get_running_loop()

            if client_type == "dify":
                # Dify client is async; non-streaming for batch processing.
                result = await client.analyze_paper(
                    paper.title,
                    paper.abstract,
                    user_id=f"batch-paper-{paper.id}",
                )
                analysis = client.to_llm_analysis(result) if result else None
            else:
                # Legacy DeepSeek client is sync; run in an executor so the
                # event loop is not blocked during the API call.
                analysis = await loop.run_in_executor(
                    None,
                    client.analyze_paper,
                    paper.title,
                    paper.abstract,
                    system_prompt_override,
                )

            thumbnail_url = await generate_thumbnail(paper.arxiv_id, paper.pdf_url)

            # Update thumbnail regardless of relevance (visuals are good).
            if thumbnail_url:
                paper.thumbnail_url = thumbnail_url

            if analysis:
                paper.summary_zh = analysis.summary_zh
                paper.relevance_score = analysis.relevance_score
                paper.relevance_reason = analysis.relevance_reason
                paper.heuristic_idea = analysis.heuristic_idea
                paper.is_processed = True
                # Store naive UTC, matching the convention used in
                # fetch_recent_papers. (datetime.utcnow() is deprecated.)
                paper.processed_at = datetime.now(timezone.utc).replace(tzinfo=None)

                # Low-relevance papers are kept but flagged as skipped.
                # (The former >=9 / >=5 branches both set "processed".)
                if analysis.relevance_score >= 5:
                    paper.processing_status = "processed"
                else:
                    paper.processing_status = "skipped"

                session.add(paper)
                session.commit()
                return True

            # Analysis produced no result: record the failure.
            paper.processing_status = "failed"
            session.add(paper)
            session.commit()
            session.refresh(paper)

        except Exception as e:
            print(f"Error processing paper {paper.arxiv_id}: {e}")
            # Roll back any half-finished transaction before recording the
            # failure, otherwise the commit below could itself raise.
            session.rollback()
            paper.processing_status = "failed"
            session.add(paper)
            session.commit()

        return False
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
async def run_daily_fetch_async():
    """Async wrapper for daily fetch logic."""
    print(f"[{datetime.now()}] Starting daily paper fetch...")

    bot = ArxivBot()
    db = get_sync_session()

    try:
        # The arxiv client is synchronous; run the fetch in a worker thread
        # so the event loop stays responsive while pages are downloaded.
        loop = asyncio.get_running_loop()
        fetched = await loop.run_in_executor(None, bot.fetch_recent_papers, db, 50, 168)
        print(f"Fetched {len(fetched)} papers from arXiv")

        # Persist each fetched paper; save_paper returns None on duplicates,
        # so the truthy count is exactly the number of new rows.
        new_count = sum(1 for item in fetched if bot.save_paper(db, item))
        print(f"Saved {new_count} new papers to database")

        # Analyze everything still marked unprocessed (async per paper).
        pending = db.exec(
            select(Paper).where(Paper.is_processed == False)
        ).all()
        print(f"Processing {len(pending)} unprocessed papers...")

        done_count = 0
        for paper in pending:
            if await bot.process_paper(db, paper):
                done_count += 1
                print(f"  Processed: {paper.title[:50]}...")

        print(f"Processed {done_count} papers with LLM analysis")

    except Exception as e:
        print(f"Error in daily fetch: {e}")
    finally:
        db.close()

    print(f"[{datetime.now()}] Daily fetch completed")
|
| 260 |
+
|
| 261 |
+
def run_daily_fetch():
    """Entry point that runs the async fetcher in a loop."""
    # Synchronous wrapper for schedulers (cron/background jobs) that cannot
    # await; asyncio.run creates and tears down a fresh event loop per call.
    asyncio.run(run_daily_fetch_async())
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# Singleton instance
# Module-level cache for the process-wide ArxivBot. Not thread-safe; fine
# for a single-worker app where callers go through get_arxiv_bot().
_arxiv_bot: Optional[ArxivBot] = None


def get_arxiv_bot() -> ArxivBot:
    """Get or create ArxivBot singleton."""
    # Lazy construction so importing this module does not build the arxiv
    # client (which configures rate limiting/retries) as a side effect.
    global _arxiv_bot
    if _arxiv_bot is None:
        _arxiv_bot = ArxivBot()
    return _arxiv_bot
|
app/services/dify_client.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dify Workflow API Client with Streaming Support.
|
| 3 |
+
|
| 4 |
+
This module replaces the DeepSeek direct API calls with Dify Chatflow API,
|
| 5 |
+
supporting streaming responses for long-running R1 reasoning processes.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import json
|
| 10 |
+
import httpx
|
| 11 |
+
from typing import Optional, AsyncGenerator, Dict, Any
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
|
| 15 |
+
from app.models import LLMAnalysis
|
| 16 |
+
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class DifyStreamEvent:
    """Represents a single event from Dify's streaming response."""
    # SSE event name; taken from the "event:" line or, failing that, from
    # the "event" key inside the JSON payload.
    event: str
    # Raw decoded JSON payload of the event.
    data: Dict[str, Any]
    # Incremental reasoning text, when the payload carries a "thought" key.
    thought: Optional[str] = None
    # Incremental answer text, when the payload carries an "answer" key.
    answer: Optional[str] = None
    # Final workflow outputs, when the payload carries an "outputs" key.
    outputs: Optional[Dict[str, Any]] = None


@dataclass
class TechnicalMapping:
    """Technical mapping analysis from Dify workflow."""
    # Free-form workflow text; rendered as "Token/Patch映射" downstream.
    token_vs_patch: str = ""
    # Rendered as "时序逻辑" (temporal logic) downstream.
    temporal_logic: str = ""
    # Rendered as "频域分析" (frequency-domain analysis) downstream.
    frequency_domain: str = ""


@dataclass
class DifyAnalysisResult:
    """Complete analysis result from Dify workflow."""
    # Chinese one-line summary of the paper's contribution.
    summary_zh: str
    # 0-10 relevance score produced by the workflow.
    relevance_score: float
    # Justification for the score.
    relevance_reason: str
    # Structured cross-domain mapping (may be all-empty).
    technical_mapping: TechnicalMapping
    # Suggested transferable research idea.
    heuristic_idea: str
    thought_process: Optional[str] = None  # R1 thinking process
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Exception hierarchy: every client failure derives from DifyClientError so
# callers can catch a single base type (see ArxivBot's batch error handling).
class DifyClientError(Exception):
    """Base exception for Dify client errors."""
    pass


class DifyEntityTooLargeError(DifyClientError):
    """Raised when request payload exceeds Dify's limit (413)."""
    pass


class DifyTimeoutError(DifyClientError):
    """Raised when request times out."""
    pass


class DifyRateLimitError(DifyClientError):
    """Raised when rate limit is exceeded (429)."""
    pass
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class DifyClient:
    """Dify Chatflow API client with streaming support.

    Wraps the ``/chat-messages`` endpoint, exposing both a raw SSE stream
    (``analyze_paper_stream``) and a convenience method that consumes the
    stream and returns a parsed ``DifyAnalysisResult`` (``analyze_paper``).
    """

    def __init__(self):
        # Fail fast at construction time when the key is missing.
        self.api_key = os.getenv("DIFY_API_KEY")
        if not self.api_key:
            raise ValueError("DIFY_API_KEY environment variable is not set")

        self.base_url = os.getenv("DIFY_API_BASE", "http://82.157.209.193:8080/v1")
        self.endpoint = f"{self.base_url}/chat-messages"
        self.timeout = httpx.Timeout(120.0, connect=10.0)  # 2 min for R1 reasoning

    def _format_query(
        self,
        topic: str,
        background: str,
        method: str,
        contribution: str,
    ) -> str:
        """Format input according to Dify workflow variable specification."""
        return f"""研究主题:{topic}
技术背景:{background}
核心方法:{method}
预期贡献:{contribution}"""

    def _build_request_body(
        self,
        query: str,
        user_id: str = "paper-insight-user",
        conversation_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Build the request body for Dify API.

        The query is sent both inside ``inputs`` and as the top-level
        ``query`` field for compatibility with different workflow configs.
        """
        body = {
            "inputs": {
                "query": query,
            },
            "query": query,  # Also send as direct query for compatibility
            "response_mode": "streaming",
            "user": user_id,
        }

        if conversation_id:
            body["conversation_id"] = conversation_id

        return body

    def _get_headers(self) -> Dict[str, str]:
        """Get request headers with authentication."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    async def analyze_paper_stream(
        self,
        title: str,
        abstract: str,
        user_id: str = "paper-insight-user",
    ) -> AsyncGenerator[DifyStreamEvent, None]:
        """
        Analyze a paper using Dify workflow with streaming.

        Yields DifyStreamEvent objects for each SSE event received.

        Raises:
            DifyEntityTooLargeError: on HTTP 413.
            DifyRateLimitError: on HTTP 429.
            DifyTimeoutError: when the request times out.
            DifyClientError: on any other HTTP or transport error.
        """
        # Format the query using paper information. The abstract is capped
        # at 500 chars to keep the payload under Dify's size limit.
        query = self._format_query(
            topic=title,
            background="arXiv论文,需要分析其与DiT/KV Cache研究的相关性",
            method=abstract[:500] if len(abstract) > 500 else abstract,
            contribution="待分析",
        )

        body = self._build_request_body(query, user_id)
        headers = self._get_headers()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            try:
                async with client.stream(
                    "POST",
                    self.endpoint,
                    json=body,
                    headers=headers,
                ) as response:
                    # Handle error responses before touching the body.
                    if response.status_code == 413:
                        raise DifyEntityTooLargeError(
                            "Request payload too large. Consider shortening the abstract."
                        )
                    elif response.status_code == 429:
                        raise DifyRateLimitError(
                            "Rate limit exceeded. Please try again later."
                        )
                    elif response.status_code >= 400:
                        error_text = await response.aread()
                        raise DifyClientError(
                            f"Dify API error {response.status_code}: {error_text.decode()}"
                        )

                    # Parse SSE stream: events are separated by blank lines.
                    buffer = ""
                    async for chunk in response.aiter_text():
                        buffer += chunk
                        # Process complete SSE events
                        while "\n\n" in buffer:
                            event_str, buffer = buffer.split("\n\n", 1)
                            event = self._parse_sse_event(event_str)
                            if event:
                                yield event

                    # Fix: flush a final event that was not "\n\n"-terminated
                    # (some servers omit the trailing blank line on close);
                    # previously this residue was silently discarded.
                    if buffer.strip():
                        event = self._parse_sse_event(buffer)
                        if event:
                            yield event

            except httpx.TimeoutException as e:
                raise DifyTimeoutError(f"Request timed out: {e}")
            except httpx.RequestError as e:
                raise DifyClientError(f"Request failed: {e}")

    def _parse_sse_event(self, event_str: str) -> Optional[DifyStreamEvent]:
        """Parse a single SSE event string into DifyStreamEvent.

        Returns None for events with no data line or non-JSON data.
        """
        lines = event_str.strip().split("\n")
        event_type = ""
        data_str = ""

        for line in lines:
            if line.startswith("event:"):
                event_type = line[6:].strip()
            elif line.startswith("data:"):
                data_str = line[5:].strip()

        if not data_str:
            return None

        try:
            data = json.loads(data_str)
        except json.JSONDecodeError:
            return None

        event = DifyStreamEvent(
            event=event_type or data.get("event", ""),
            data=data,
        )

        # Extract common fields for convenient access.
        if "thought" in data:
            event.thought = data["thought"]
        if "answer" in data:
            event.answer = data["answer"]
        if "outputs" in data:
            event.outputs = data["outputs"]

        return event

    async def analyze_paper(
        self,
        title: str,
        abstract: str,
        user_id: str = "paper-insight-user",
    ) -> Optional[DifyAnalysisResult]:
        """
        Analyze a paper and return the complete result.

        This method consumes the entire stream and returns the final result.
        Use analyze_paper_stream() for real-time streaming updates.
        Returns None on any Dify client error (best-effort semantics).
        """
        thought_parts = []
        answer_parts = []
        final_outputs = None

        try:
            async for event in self.analyze_paper_stream(title, abstract, user_id):
                if event.thought:
                    thought_parts.append(event.thought)
                if event.answer:
                    answer_parts.append(event.answer)
                if event.outputs:
                    final_outputs = event.outputs

                # Check for workflow completion
                if event.event == "workflow_finished" and event.outputs:
                    final_outputs = event.outputs

            # Prefer structured workflow outputs...
            if final_outputs:
                return self._parse_outputs(final_outputs, "".join(thought_parts))

            # ...and fall back to parsing the concatenated answer text.
            full_answer = "".join(answer_parts)
            if full_answer:
                return self._parse_answer(full_answer, "".join(thought_parts))

            return None

        except DifyClientError as e:
            print(f"Dify analysis error: {e}")
            return None

    def _parse_outputs(
        self,
        outputs: Dict[str, Any],
        thought_process: str = "",
    ) -> DifyAnalysisResult:
        """Parse Dify workflow outputs into DifyAnalysisResult."""
        technical_mapping = TechnicalMapping()
        if "technical_mapping" in outputs:
            tm = outputs["technical_mapping"]
            if isinstance(tm, dict):
                technical_mapping = TechnicalMapping(
                    token_vs_patch=tm.get("token_vs_patch", ""),
                    temporal_logic=tm.get("temporal_logic", ""),
                    frequency_domain=tm.get("frequency_domain", ""),
                )

        # Fix: guard the float conversion — the workflow may emit a
        # non-numeric score string, which previously raised ValueError.
        try:
            score = float(outputs.get("relevance_score", 0))
        except (TypeError, ValueError):
            score = 0.0

        return DifyAnalysisResult(
            summary_zh=outputs.get("summary_zh", ""),
            relevance_score=score,
            relevance_reason=outputs.get("relevance_reason", ""),
            technical_mapping=technical_mapping,
            heuristic_idea=outputs.get("heuristic_idea", ""),
            thought_process=thought_process if thought_process else None,
        )

    def _parse_answer(
        self,
        answer: str,
        thought_process: str = "",
    ) -> Optional[DifyAnalysisResult]:
        """Parse answer string (JSON) into DifyAnalysisResult.

        Non-JSON answers degrade to a zero-score result carrying the first
        200 characters of the text as the summary.
        """
        try:
            data = json.loads(answer)
            return self._parse_outputs(data, thought_process)
        except json.JSONDecodeError:
            # If not JSON, fall back to an unstructured placeholder result.
            return DifyAnalysisResult(
                summary_zh=answer[:200] if answer else "",
                relevance_score=0,
                relevance_reason="无法解析结构化输出",
                technical_mapping=TechnicalMapping(),
                heuristic_idea="",
                thought_process=thought_process if thought_process else None,
            )

    def to_llm_analysis(self, result: DifyAnalysisResult) -> LLMAnalysis:
        """Convert DifyAnalysisResult to legacy LLMAnalysis model.

        The structured technical mapping is folded into heuristic_idea as a
        labeled text section for backward compatibility with Paper fields.
        """
        tech_mapping_str = ""
        if result.technical_mapping:
            parts = []
            if result.technical_mapping.token_vs_patch:
                parts.append(f"Token/Patch映射: {result.technical_mapping.token_vs_patch}")
            if result.technical_mapping.temporal_logic:
                parts.append(f"时序逻辑: {result.technical_mapping.temporal_logic}")
            if result.technical_mapping.frequency_domain:
                parts.append(f"频域分析: {result.technical_mapping.frequency_domain}")
            if parts:
                tech_mapping_str = "\n\n【技术映射】\n" + "\n".join(parts)

        heuristic_with_mapping = result.heuristic_idea + tech_mapping_str

        return LLMAnalysis(
            summary_zh=result.summary_zh,
            relevance_score=result.relevance_score,
            relevance_reason=result.relevance_reason,
            heuristic_idea=heuristic_with_mapping,
        )
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
# Singleton instance
# Process-wide DifyClient cache; construction validates DIFY_API_KEY, so
# callers should only reach this path when the key is configured.
_dify_client: Optional[DifyClient] = None


def get_dify_client() -> DifyClient:
    """Get or create DifyClient singleton."""
    # Lazy so importing this module never raises on a missing API key.
    global _dify_client
    if _dify_client is None:
        _dify_client = DifyClient()
    return _dify_client
|
app/services/llm_brain.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
from app.models import LLMAnalysis
|
| 8 |
+
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
SYSTEM_PROMPT = """你是一名资深 AI 研究员,专注于 **Autoregressive Diffusion Transformers (DiT)** 的推理加速与 **KV Cache 压缩**。
|
| 12 |
+
你的核心能力是**跨领域技术迁移**:你能敏锐地从 LLM(大语言模型)或 ViT(视觉 Transformer)的优化论文中,提取出能应用到 DiT 视频/图像生成上的灵感。
|
| 13 |
+
|
| 14 |
+
你的任务是阅读给定的论文摘要,并按以下 JSON 格式输出分析结果:
|
| 15 |
+
|
| 16 |
+
{
|
| 17 |
+
"summary_zh": "中文一句话概括核心贡献(直击痛点,如:'提出了一种基于Token重要性的动态剪枝方法,减少50% FLOPs')。",
|
| 18 |
+
"relevance_score": 0-10 评分。
|
| 19 |
+
"relevance_reason": "简述评分理由。如果是 DiT 相关给高分;如果是 LLM KV Cache 相关,评估其迁移潜力。",
|
| 20 |
+
"heuristic_idea": "【核心价值】这是最重要的部分。请进行逻辑推演和思维发散:\n1. **如果这是 LLM 的论文**:它的 'Token' 对应 DiT 的 'Patch' 吗?它的 '序列长度' 对应 DiT 的 '时间步(Timestep)' 还是 '空间分辨率'?\n2. **如果这是剪枝/量化**:DiT 的扩散过程中,早期和晚期的时间步对精度的敏感度不同,这篇论文的方法能利用这一点吗?\n3. **具体建议**:给出一个具体的、可实验的 Idea(例如:'尝试将此文的 Window Attention 机制应用到 DiT 的前 50% 去噪步中')。"
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
评分标准:
|
| 24 |
+
- **9-10**: 直接针对 DiT/Video Diffusion 的加速/缓存优化。
|
| 25 |
+
- **7-8**: 高质量的 LLM KV Cache、ViT 剪枝、Token Merging 论文,且迁移路径清晰。
|
| 26 |
+
- **4-6**: 通用的 Transformer 量化/硬件加速,参考价值中等。
|
| 27 |
+
- **0-3**: 纯 NLP 任务(如 RAG、Prompt Engineering)或与生成/架构无关。
|
| 28 |
+
|
| 29 |
+
注意:必须返回纯 JSON 格式,无额外文本。"""
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class LLMBrain:
    """DeepSeek LLM client for paper analysis.

    Thin wrapper over the OpenAI-compatible DeepSeek API that turns a paper
    title + abstract into a structured ``LLMAnalysis``.
    """

    def __init__(self):
        key = os.getenv("DEEPSEEK_API_KEY")
        if not key:
            raise ValueError("DEEPSEEK_API_KEY environment variable is not set")

        self.client = OpenAI(
            api_key=key,
            base_url=os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com"),
        )
        self.model = os.getenv("DEEPSEEK_MODEL", "deepseek-chat")

    def analyze_paper(
        self,
        title: str,
        abstract: str,
        system_prompt_override: Optional[str] = None,
    ) -> Optional[LLMAnalysis]:
        """Analyze a paper and return structured analysis.

        Sends the title/abstract to DeepSeek with a JSON-forcing system
        prompt (optionally extended by ``system_prompt_override``) and maps
        the model's JSON reply onto an ``LLMAnalysis``. Returns ``None`` on
        any API or parsing failure (best-effort batch semantics).
        """
        system_prompt = SYSTEM_PROMPT
        if system_prompt_override:
            system_prompt = f"{SYSTEM_PROMPT}\n\n用户补充要求:\n{system_prompt_override}"

        user_prompt = f"""请分析以下论文:

标题: {title}

摘要: {abstract}

请按照系统提示中的JSON格式返回分析结果。"""

        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                max_tokens=1500,
                temperature=0.3,
                response_format={"type": "json_object"},
            )

            payload = completion.choices[0].message.content
            if not payload:
                return None

            parsed = json.loads(payload)
            return LLMAnalysis(
                summary_zh=parsed.get("summary_zh", ""),
                relevance_score=float(parsed.get("relevance_score", 0)),
                relevance_reason=parsed.get("relevance_reason", ""),
                heuristic_idea=parsed.get("heuristic_idea", ""),
            )

        except json.JSONDecodeError as e:
            print(f"JSON parsing error: {e}")
            return None
        except Exception as e:
            print(f"LLM analysis error: {e}")
            return None
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# Singleton instance
# Process-wide LLMBrain cache; construction validates DEEPSEEK_API_KEY.
_llm_brain: Optional[LLMBrain] = None


def get_llm_brain() -> LLMBrain:
    """Get or create LLMBrain singleton."""
    # Lazy so importing this module never raises on a missing API key.
    global _llm_brain
    if _llm_brain is None:
        _llm_brain = LLMBrain()
    return _llm_brain
|
app/services/pdf_renderer.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # pymupdf
|
| 2 |
+
import httpx
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Optional
|
| 5 |
+
import asyncio
|
| 6 |
+
|
| 7 |
+
# Constants
# Thumbnails live under the app's static directory; paths are resolved
# relative to this file so they work regardless of the working directory.
STATIC_DIR = Path(__file__).parent.parent / "static"
THUMBNAILS_DIR = STATIC_DIR / "thumbnails"

# Ensure directories exist (import-time side effect so first render can't
# fail on a missing directory).
THUMBNAILS_DIR.mkdir(parents=True, exist_ok=True)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
async def generate_thumbnail(arxiv_id: str, pdf_url: str) -> Optional[str]:
    """
    Generates a JPG thumbnail from the first page of an arXiv PDF.

    Args:
        arxiv_id: The arXiv ID of the paper (used for filename).
        pdf_url: The URL to download the PDF from.

    Returns:
        str: Relative URL path to the thumbnail (e.g., "/static/thumbnails/1234.5678.jpg")
        None: If generation fails.
    """
    # NOTE(review): old-style arXiv IDs contain "/" (e.g. "cs/0112017"),
    # which would nest the file under a subdirectory — confirm callers only
    # pass new-style IDs, or sanitize here.
    filename = f"{arxiv_id}.jpg"
    file_path = THUMBNAILS_DIR / filename
    # Fix: the served URL must reference the per-paper file written below;
    # a constant path would point every paper at the same missing image.
    relative_url = f"/static/thumbnails/{filename}"

    # 1. Check cache: if the file is already rendered, reuse it.
    if file_path.exists():
        return relative_url

    try:
        # 2. Download PDF
        async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
            # Arxiv often requires a user agent
            headers = {
                "User-Agent": "PaperInsight/1.0 (mailto:your-email@example.com)"
            }
            response = await client.get(pdf_url, headers=headers)
            response.raise_for_status()
            pdf_data = response.content

        # 3. Render Thumbnail (offload CPU-bound PyMuPDF work to a thread).
        success = await asyncio.to_thread(_render_thumbnail, pdf_data, file_path)
        if not success:
            return None

        return relative_url

    except httpx.HTTPStatusError as e:
        print(f"Failed to download PDF for {arxiv_id}: {e}")
        return None
    except Exception as e:
        print(f"Error generating thumbnail for {arxiv_id}: {e}")
        return None
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _render_thumbnail(pdf_data: bytes, file_path: Path) -> bool:
    """Rasterize page 1 of the given PDF bytes to an image file on disk."""
    document = None
    try:
        document = fitz.open(stream=pdf_data, filetype="pdf")
        # An empty document has nothing to render.
        if len(document) < 1:
            return False
        first_page = document[0]
        # 1.5x scale factor for a sharper thumbnail than the default DPI.
        pixmap = first_page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
        pixmap.save(str(file_path))
        return True
    except Exception as e:
        print(f"Error rendering thumbnail: {e}")
        return False
    finally:
        if document is not None:
            document.close()
|
backfill_thumbnails.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
# Make the `app` package importable when this script is run directly from
# the backend directory.
sys.path.append(str(Path(__file__).parent))
|
| 7 |
+
|
| 8 |
+
from sqlmodel import select
|
| 9 |
+
from app.database import get_sync_session
|
| 10 |
+
from app.models import Paper
|
| 11 |
+
from app.services.pdf_renderer import generate_thumbnail
|
| 12 |
+
|
| 13 |
+
async def backfill_thumbnails():
    """Generate and persist thumbnails for every paper that lacks one."""
    print("Starting thumbnail backfill...")
    session = get_sync_session()

    try:
        # Select only papers whose thumbnail_url column is still NULL.
        statement = select(Paper).where(Paper.thumbnail_url == None)
        papers = session.exec(statement).all()

        total = len(papers)
        print(f"Found {total} papers needing thumbnails.")

        for index, paper in enumerate(papers, start=1):
            print(f"[{index}/{total}] Processing {paper.arxiv_id}...")

            if not paper.pdf_url:
                print(f"  Skipping {paper.arxiv_id}: No PDF URL")
                continue

            thumbnail_url = await generate_thumbnail(paper.arxiv_id, paper.pdf_url)

            if thumbnail_url:
                paper.thumbnail_url = thumbnail_url
                session.add(paper)
                # Commit per paper so progress survives an interrupted run.
                session.commit()
                print(f"  Generated: {thumbnail_url}")
            else:
                print(f"  Failed to generate thumbnail for {paper.arxiv_id}")

    except Exception as e:
        print(f"Error during backfill: {e}")
    finally:
        session.close()
        print("Backfill completed.")

if __name__ == "__main__":
    asyncio.run(backfill_thumbnails())
|
main.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Dict, List, Optional
|
| 3 |
+
|
| 4 |
+
import psycopg2
|
| 5 |
+
import psycopg2.extras
|
| 6 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
|
| 9 |
+
# Postgres connection string; left empty when unset so request handlers can
# report a clear configuration error instead of crashing at import time.
DATABASE_URL = os.getenv("DATABASE_URL", "")

app = FastAPI()

# Allow browser calls from any GitHub Pages origin (https://*.github.io).
app.add_middleware(
    CORSMiddleware,
    allow_origin_regex=r"^https://.*\.github\.io$",
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@app.get("/")
|
| 23 |
+
def root() -> Dict[str, str]:
|
| 24 |
+
return {"status": "ok"}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@app.get("/health")
|
| 28 |
+
def health() -> Dict[str, str]:
|
| 29 |
+
return {"status": "ok"}
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def fetch_papers(limit: int, offset: int) -> List[Dict[str, Any]]:
    """Return up to `limit` rows from the papers table, newest first.

    Tries the candidate table names in order (schema varies between
    deployments). Raises HTTPException(500) on configuration or database
    errors so FastAPI reports them to the client.
    """
    if not DATABASE_URL:
        raise HTTPException(status_code=500, detail="DATABASE_URL is not set")

    table_candidates = ["paper", "papers"]
    last_error: Optional[Exception] = None

    for table in table_candidates:
        conn = None
        try:
            conn = psycopg2.connect(DATABASE_URL)
            # `with conn` scopes the transaction (commit on success,
            # rollback on error) — it does NOT close the connection.
            with conn:
                with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                    # `table` comes from the hardcoded whitelist above, so the
                    # f-string interpolation is not an injection risk.
                    cur.execute(
                        f"SELECT * FROM {table} ORDER BY published DESC NULLS LAST LIMIT %s OFFSET %s",
                        (limit, offset),
                    )
                    rows = cur.fetchall()
                    return [dict(row) for row in rows]
        except psycopg2.errors.UndefinedTable as exc:
            last_error = exc
            continue
        except Exception as exc:
            last_error = exc
            break
        finally:
            # BUG FIX: previously the connection was never closed, leaking one
            # Postgres connection per request.
            if conn is not None:
                conn.close()

    detail = str(last_error) if last_error else "paper table not found"
    raise HTTPException(status_code=500, detail=f"Database error: {detail}")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@app.get("/papers")
|
| 61 |
+
def papers(
|
| 62 |
+
limit: int = Query(20, ge=1, le=100),
|
| 63 |
+
offset: int = Query(0, ge=0),
|
| 64 |
+
) -> List[Dict[str, Any]]:
|
| 65 |
+
return fetch_papers(limit, offset)
|
paper_insight.db
ADDED
|
Binary file (20.5 kB). View file
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
name = "backend"
version = "0.1.0"
description = "Paper Insight - arXiv paper fetching and summarization API"
readme = "README.md"
# NOTE(review): the Dockerfile builds on python:3.10-slim, which conflicts
# with this 3.12 floor — confirm which runtime is intended.
requires-python = ">=3.12"
dependencies = [
    "apscheduler>=3.11.2",
    "arxiv>=2.3.1",
    "fastapi>=0.128.0",
    "httpx>=0.28.1",
    "openai>=2.14.0",
    "psycopg2-binary>=2.9.11",
    # Imported as `fitz` by the PDF thumbnail renderer.
    "pymupdf>=1.26.7",
    "python-dotenv>=1.2.1",
    "socksio>=1.0.0",
    "sqlmodel>=0.0.31",
    "uvicorn>=0.40.0",
]
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|