Spaces:

YuITC
/

arXivRAG-Multimodal-Conversational-RAG-System

Runtime error

App Files Files Community

YuITC committed on Jul 14, 2025

Commit

c8e875f

1 Parent(s): 05877ff

Add application file

Browse files

Files changed (45) hide show

.dockerignore +59 -0
Dockerfile +59 -0
app.py +19 -0
src/__init__.py +3 -0
src/__pycache__/__init__.cpython-310.pyc +0 -0
src/__pycache__/api.cpython-310.pyc +0 -0
src/__pycache__/config.cpython-310.pyc +0 -0
src/api.py +308 -0
src/config.py +43 -0
src/data_extraction/__init__.py +3 -0
src/data_extraction/__pycache__/__init__.cpython-310.pyc +0 -0
src/data_extraction/__pycache__/extractor.cpython-310.pyc +0 -0
src/data_extraction/extractor.py +51 -0
src/fetcher/__init__.py +3 -0
src/fetcher/__pycache__/__init__.cpython-310.pyc +0 -0
src/fetcher/__pycache__/arxiv_fetcher.cpython-310.pyc +0 -0
src/fetcher/arxiv_cs_subjects.json +162 -0
src/fetcher/arxiv_fetcher.py +118 -0
src/processors/__init__.py +3 -0
src/processors/__pycache__/__init__.cpython-310.pyc +0 -0
src/processors/__pycache__/image_processor.cpython-310.pyc +0 -0
src/processors/__pycache__/prompts.cpython-310.pyc +0 -0
src/processors/__pycache__/table_processor.cpython-310.pyc +0 -0
src/processors/__pycache__/text_processor.cpython-310.pyc +0 -0
src/processors/image_processor.py +64 -0
src/processors/prompts.py +101 -0
src/processors/table_processor.py +57 -0
src/processors/text_processor.py +57 -0
src/rag/__init__.py +3 -0
src/rag/__pycache__/__init__.cpython-310.pyc +0 -0
src/rag/__pycache__/pipeline.cpython-310.pyc +0 -0
src/rag/pipeline.py +149 -0
src/storage/__init__.py +3 -0
src/storage/__pycache__/__init__.cpython-310.pyc +0 -0
src/storage/__pycache__/vectorstore.cpython-310.pyc +0 -0
src/storage/vectorstore.py +151 -0
static/css/modern-styles.css +1177 -0
static/data/arxiv_cs_subjects.json +162 -0
static/index.html +304 -0
static/js/api.js +219 -0
static/js/chat.js +198 -0
static/js/main.js +196 -0
static/js/ui.js +364 -0
utils/__pycache__/setup_logger.cpython-310.pyc +0 -0
utils/setup_logger.py +32 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,59 @@

+# Exclude files and directories from Docker build
+# Version control
+.git
+.gitignore
+.github
+.gitattributes
+# Python artifacts
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual environment
+venv/
+ENV/
+env/
+# IDE artifacts
+.idea/
+.vscode/
+*.swp
+*.swo
+# Docker artifacts
+# .dockerignore
+# Dockerfile
+# docker-compose.yml
+# Documentation
+docs/
+*.md
+LICENSE
+# Misc
+.DS_Store
+**/*.log
+**/.env*
+# Others
+*.ipynb
+LICENCE
+assets/

Dockerfile ADDED Viewed

	@@ -0,0 +1,59 @@

+# Stage 1: build the Python dependencies with a full compiler toolchain
+FROM python:3.10-slim AS builder
+WORKDIR /app
+# Install build dependencies needed to compile wheels
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    gcc \
+    g++ \
+    python3-dev \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements file
+COPY requirements.txt .
+# Install dependencies into a dedicated prefix so the whole tree can be copied
+# into the runtime image. A plain `pip install` as root writes to /usr/local,
+# not /root/.local, so copying /root/.local from this stage transfers none of
+# the installed packages.
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir --prefix=/install -r requirements.txt
+# Stage 2: runtime image
+FROM python:3.10-slim
+WORKDIR /app
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    poppler-utils \
+    tesseract-ocr \
+    libreoffice \
+    && rm -rf /var/lib/apt/lists/*
+# Copy the installed packages from the builder stage into the interpreter's
+# own prefix: world-readable (unlike /root) and already on PATH / sys.path,
+# so the non-root user below can import them.
+COPY --from=builder /install /usr/local
+# Copy application code
+COPY . .
+# Create necessary directories
+RUN mkdir -p /app/data/temp
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PORT=8000
+# Expose the port
+EXPOSE 8000
+# Set a non-root user that owns the application directory
+RUN useradd -m appuser
+RUN chown -R appuser:appuser /app
+USER appuser
+# Run the application
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,19 @@

+"""
+Main entry point for the arXivCSRAG application.
+"""
+import os
+import uvicorn
+from dotenv import load_dotenv
+from utils.setup_logger import setup_logger
+load_dotenv()
+logger = setup_logger(__name__)
+if __name__ == '__main__':
+    from src.api import app
+    port = int(os.environ.get('PORT', 8000))
+    logger.info(f"Starting arXivCSRAG Application on port {port}...")
+    uvicorn.run(app, host='0.0.0.0', port=port)

src/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+arXivCSRAG Application source code.
+"""

src/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (215 Bytes). View file

src/__pycache__/api.cpython-310.pyc ADDED Viewed

Binary file (8.8 kB). View file

src/__pycache__/config.cpython-310.pyc ADDED Viewed

Binary file (1.08 kB). View file

src/api.py ADDED Viewed

	@@ -0,0 +1,308 @@

+"""
+FastAPI backend for the arXivCSRAG application.
+"""
+import os
+from typing                  import List, Optional
+from pathlib                 import Path
+from datetime                import datetime
+from pydantic                import BaseModel
+from fastapi                 import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi.staticfiles     import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
+from utils.setup_logger             import setup_logger
+from src.config                     import TEMP_DIR, ROOT_DIR
+from src.fetcher.arxiv_fetcher      import ArxivFetcher
+from src.data_extraction.extractor  import extract_from_pdf, separate_content_types
+from src.processors.text_processor  import TextProcessor
+from src.processors.table_processor import TableProcessor
+from src.processors.image_processor import ImageProcessor
+from src.storage.vectorstore        import VectorStore
+from src.rag.pipeline               import RAGPipeline
+# Configure logging
+logger = setup_logger(__name__)
+# Initialize the FastAPI app (title/description/version are passed as API metadata)
+app = FastAPI(
+    title       = 'arXivCSRAG API',
+    description = 'API for the arXivCSRAG Multimodal RAG Application',
+    version     = '1.0.0',
+)
+# CORS configuration: every origin, method and header is allowed.
+# NOTE(review): wildcard origins together with allow_credentials=True is very
+# permissive — confirm this is intended for the deployed Space.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins     = ['*'],
+    allow_credentials = True,
+    allow_methods     = ['*'],
+    allow_headers     = ['*'],
+)
+# Models
+# Request body for /api/configure: the two secrets that the endpoint copies
+# into the GOOGLE_API_KEY and HF_TOKEN environment variables.
+class APIKeys(BaseModel):
+    gemini_api_key   : str
+    huggingface_token: str
+# Request body for /api/fetch-papers. Every field is forwarded under the same
+# name to ArxivFetcher.fetch_papers; only `query` is required.
+class SearchQuery(BaseModel):
+    subject_tags: Optional[List[str]] = None
+    start_date  : Optional[str]       = None
+    end_date    : Optional[str]       = None
+    max_results : int                 = 10
+    query       : str
+# Request body identifying a single paper by its arXiv ID
+# (used by /api/paper-metadata and /api/download-paper).
+class PaperID(BaseModel):
+    arxiv_id: str
+# Request body carrying one user message
+# (used by /api/chat and /api/fetch-citations).
+class ChatMessage(BaseModel):
+    message: str
+# Initialize components.
+# These are module-level objects created once when this module is imported
+# and shared by every request handler below.
+arxiv_fetcher   = ArxivFetcher()
+text_processor  = TextProcessor()
+table_processor = TableProcessor()
+image_processor = ImageProcessor()
+vector_store    = VectorStore()
+# The pipeline starts out wired to the store's current retriever; handlers
+# re-assign `rag_pipeline.retriever` after the store changes.
+rag_pipeline    = RAGPipeline(vector_store.retriever)
+# API endpoints
+@app.post('/api/configure')
+async def configure_api_keys(api_keys: APIKeys):
+    """Configure API keys for the application."""
+    try:
+        # Set environment variables
+        os.environ['GOOGLE_API_KEY'] = api_keys.gemini_api_key
+        os.environ['HF_TOKEN']       = api_keys.huggingface_token
+        logger.info('API keys configured successfully')
+        return {'status' : 'success',
+                'message': 'API keys configured successfully'}
+    except Exception as e:
+        logger.error(f"Error configuring API keys: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post('/api/fetch-papers')
+async def fetch_papers(search_query: SearchQuery):
+    """Fetch papers from arXiv based on search query and filters."""
+    try:
+        papers = arxiv_fetcher.fetch_papers(
+            subject_tags = search_query.subject_tags,
+            start_date   = search_query.start_date,
+            end_date     = search_query.end_date,
+            max_results  = search_query.max_results,
+            query        = search_query.query
+        )
+        logger.info(f"Fetched {len(papers)} papers")
+        return {'status': 'success', 'papers': papers}
+    except Exception as e:
+        logger.error(f"Error fetching papers: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post('/api/paper-metadata')
+async def get_paper_metadata(paper_id: PaperID):
+    """Get metadata for a specific paper."""
+    try:
+        search = arxiv_fetcher.fetch_papers(f"id:{paper_id.arxiv_id}", max_results=1)
+        if not search:
+            raise HTTPException(status_code=404, detail='Paper not found')
+        return {'status': 'success', 'metadata': search[0]}
+    except Exception as e:
+        logger.error(f"Error getting paper metadata: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post('/api/download-paper')
+async def download_paper(paper_id: PaperID):
+    """Download a paper's PDF from arXiv."""
+    try:
+        pdf_path = arxiv_fetcher.download_paper(paper_id.arxiv_id)
+        if not pdf_path:
+            raise HTTPException(status_code=404, detail="Failed to download paper")
+        logger.info(f"Downloaded paper {paper_id.arxiv_id} to {pdf_path}")
+        return {'status': 'success', 'file_path': str(pdf_path)}
+    except Exception as e:
+        logger.error(f"Error downloading paper: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post('/api/upload-paper')
+async def upload_paper(file: UploadFile = File(...)):
+    """Upload a paper's PDF file."""
+    try:
+        # Create a unique filename
+        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
+        filename  = f"uploaded_{timestamp}.pdf"
+        filepath  = TEMP_DIR / filename
+        # Save the uploaded file
+        with open(filepath, 'wb') as f:
+            f.write(await file.read())
+        logger.info(f"Uploaded paper saved at {filepath}")
+        return {'status': 'success', 'file_path': str(filepath)}
+    except Exception as e:
+        logger.error(f"Error uploading paper: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post('/api/process-paper')
+async def process_paper(file_path: str = Form(...)):
+    """Process a paper for RAG."""
+    try:
+        # # Reset the vector store
+        # vector_store.reset()
+        # # Set the new retriever for the RAG pipeline
+        # rag_pipeline.retriever = vector_store.retriever
+        # Process the paper
+        pdf_path = Path(file_path)
+        logger.info(f"Processing paper at {pdf_path}")
+        if not pdf_path.exists():
+            raise HTTPException(status_code=404, detail='PDF file not found')
+        # Extract content from PDF
+        logger.info(f"Extracting content from {pdf_path}")
+        chunks = extract_from_pdf(pdf_path)
+        # Separate content types
+        logger.info(f"Separating content types from {len(chunks)} chunks")
+        content = separate_content_types(chunks)
+        # Process and summarize content
+        logger.info(f"Processing {len(content['texts'])} text content")
+        text_summaries  = text_processor.process(content['texts'])
+        logger.info(f"Processing {len(content['tables'])} table content")
+        table_summaries = table_processor.process(content['tables'])
+        logger.info(f"Processing {len(content['images'])} image content")
+        image_summaries = image_processor.process(content['images'])
+        # Add to vector store
+        logger.info("Adding processed content to vector store")
+        vector_store.add_contents(
+            content['texts'] , text_summaries,
+            content['tables'], table_summaries,
+            content['images'], image_summaries
+        )
+        logger.info(f"Processed paper {pdf_path.name} successfully")
+        return {
+            'status': 'success',
+            'stats' : {
+                'texts' : len(content['texts']),
+                'tables': len(content['tables']),
+                'images': len(content['images'])
+            }
+        }
+    except Exception as e:
+        logger.error(f"Error processing paper: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post('/api/chat')
+async def chat_with_paper(message: ChatMessage):
+    """
+    Chat with a processed paper.
+    Returns:
+        - status: success or error
+        - response: The generated text response
+        - citations: Dictionary containing three keys:
+            - texts: List of text excerpts used as citations
+            - images: List of base64-encoded image strings
+            - tables: List of HTML-formatted table strings
+    """
+    try:
+        rag_pipeline.retriever = vector_store.retriever
+        # Query the RAG pipeline
+        logger.info(f"Chatting with paper: {message.message}")
+        response = rag_pipeline.query(message.message)
+        # Get the retrieved documents
+        retrieved_docs = vector_store.retrieve(message.message)
+        parsed_docs    = rag_pipeline.parse_docs(retrieved_docs)
+        return {
+            'status'   : 'success',
+            'response' : response['response'],
+            'citations': parsed_docs
+        }
+    except Exception as e:
+        logger.error(f"Error chatting with paper: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post('/api/fetch-citations')
+async def fetch_citations(message: ChatMessage):
+    """
+    Fetch citations for a specific query without generating a response.
+    This is useful for retrieving only the source documents that would be used
+    to answer a query without generating the complete answer.
+    Returns:
+        - status: success or error
+        - citations: Dictionary containing three keys:
+            - texts: List of text excerpts used as citations
+            - images: List of base64-encoded image strings
+            - tables: List of HTML-formatted table strings
+    """
+    try:
+        # Get the retrieved documents
+        retrieved_docs = vector_store.retrieve(message.message)
+        parsed_docs    = rag_pipeline.parse_docs(retrieved_docs)
+        return {
+            'status'   : 'success',
+            'citations': parsed_docs
+        }
+    except Exception as e:
+        logger.error(f"Error fetching citations: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post('/api/reset-chat')
+async def reset_chat():
+    """Reset the chat and vector store."""
+    try:
+        logger.info("Resetting chat and vector store")
+        vector_store.reset()
+        rag_pipeline.retriever = vector_store.retriever
+        rag_pipeline.reset_memory()
+        return {'status': 'success', 'message': 'Chat reset successfully'}
+    except Exception as e:
+        logger.error(f"Error resetting chat: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+# Serve static files.
+# Three mounts over the project's `static` directory: '/static' (html=False),
+# '/data' for `static/data`, and '/' with html=True.
+# NOTE(review): the '/' mount is registered last and after all /api routes —
+# presumably so it only acts as a fallback; confirm before reordering.
+app.mount('/static', StaticFiles(directory=ROOT_DIR / 'static', html=False), name='static')
+app.mount('/data'  , StaticFiles(directory=ROOT_DIR / 'static/data')       , name='data')
+app.mount('/'      , StaticFiles(directory=ROOT_DIR / 'static', html=True) , name='root')

src/config.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""
+Configuration settings for the arXivCSRAG application.
+"""
+import os
+import torch
+from pathlib import Path
+from dotenv import load_dotenv
+from huggingface_hub import whoami
+# Load environment variables
+load_dotenv()
+user = whoami(token=os.getenv('HF_TOKEN'))
+# Base paths
+ROOT_DIR = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+TEMP_DIR = ROOT_DIR / 'temp'
+if not TEMP_DIR.exists(): TEMP_DIR.mkdir(parents=True, exist_ok=True)
+# PDF Extraction Configuration
+PDF_EXTRACTION_CONFIG = {
+    'infer_table_structure'         : True,
+    'strategy'                      : 'hi_res',
+    'extract_image_block_types'     : ['Image'],
+    'extract_image_block_to_payload': True,
+    'chunking_strategy'             : 'by_title',
+    'max_characters'                : 10000,
+    'combine_text_under_n_chars'    : 2000,
+    'new_after_n_chars'             : 6000
+}
+# LLM & Embedding model Configuration
+MODEL_NAME      = 'gemini-2.5-flash-lite-preview-06-17'
+# EMBEDDING_MODEL = 'BAAI/bge-base-en-v1.5'
+EMBEDDING_MODEL = 'BAAI/bge-m3' # Multi-lingual BGE model
+DEVICE          = 'cuda' if torch.cuda.is_available() else 'cpu'
+# Vector Store Configuration
+COLLECTION_NAME = 'arXiv_CS_RAG'

src/data_extraction/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+Data extraction module for PDF documents.
+"""

src/data_extraction/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (237 Bytes). View file

src/data_extraction/__pycache__/extractor.cpython-310.pyc ADDED Viewed

Binary file (1.76 kB). View file

src/data_extraction/extractor.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""
+PDF document extraction utilities.
+"""
+from pathlib                    import Path
+from typing                     import List, Dict, Any, Union
+from unstructured.partition.pdf import partition_pdf
+from src.config import PDF_EXTRACTION_CONFIG
+def extract_from_pdf(pdf_path: Union[str, Path]) -> List[Any]:
+    """
+    Extract content from a PDF file using unstructured.
+    Args:
+        pdf_path (Union[str, Path]): Path to the PDF file
+    Returns:
+        List[Any]: List of extracted elements (text, tables, images)
+    """
+    pdf_path = Path(pdf_path) if isinstance(pdf_path, str) else pdf_path
+    if not pdf_path.exists():
+        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+    chunks = partition_pdf(filename=pdf_path, **PDF_EXTRACTION_CONFIG)
+    return chunks
+def separate_content_types(chunks: List[Any]) -> Dict[str, List[Any]]:
+    """
+    Separate the extracted content into text, images, and tables.
+    Args:
+        chunks (List[Any]): List of extracted elements from the PDF
+    Returns:
+        Dict[str, List[Any]]: Dictionary with keys 'texts', 'images', 'tables'
+    """
+    texts, images, tables = [], [], []
+    for chunk in chunks:
+        if   type(chunk).__name__ == 'Table': tables.append(chunk)
+        elif type(chunk).__name__ == 'Image': images.append(chunk)
+        elif type(chunk).__name__ == 'CompositeElement':
+            texts.append(chunk)
+            for element in chunk.metadata.orig_elements:
+                if   type(element).__name__ == 'Image': images.append(element)
+                elif type(element).__name__ == 'Table': tables.append(element)
+    return {'texts': texts, 'images': images, 'tables': tables}

src/fetcher/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+arXiv fetcher utilities module.
+"""

src/fetcher/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (219 Bytes). View file

src/fetcher/__pycache__/arxiv_fetcher.cpython-310.pyc ADDED Viewed

Binary file (3.88 kB). View file

src/fetcher/arxiv_cs_subjects.json ADDED Viewed

	@@ -0,0 +1,162 @@

+[
+  {
+    "tag": "cs.AI",
+    "name": "Artificial Intelligence"
+  },
+  {
+    "tag": "cs.CL",
+    "name": "Computation and Language"
+  },
+  {
+    "tag": "cs.CC",
+    "name": "Computational Complexity"
+  },
+  {
+    "tag": "cs.CE",
+    "name": "Computational Engineering, Finance, and Science"
+  },
+  {
+    "tag": "cs.CG",
+    "name": "Computational Geometry"
+  },
+  {
+    "tag": "cs.GT",
+    "name": "Computer Science and Game Theory"
+  },
+  {
+    "tag": "cs.CV",
+    "name": "Computer Vision and Pattern Recognition"
+  },
+  {
+    "tag": "cs.CY",
+    "name": "Computers and Society"
+  },
+  {
+    "tag": "cs.CR",
+    "name": "Cryptography and Security"
+  },
+  {
+    "tag": "cs.DS",
+    "name": "Data Structures and Algorithms"
+  },
+  {
+    "tag": "cs.DB",
+    "name": "Databases"
+  },
+  {
+    "tag": "cs.DL",
+    "name": "Digital Libraries"
+  },
+  {
+    "tag": "cs.DM",
+    "name": "Discrete Mathematics"
+  },
+  {
+    "tag": "cs.DC",
+    "name": "Distributed, Parallel, and Cluster Computing"
+  },
+  {
+    "tag": "cs.ET",
+    "name": "Emerging Technologies"
+  },
+  {
+    "tag": "cs.FL",
+    "name": "Formal Languages and Automata Theory"
+  },
+  {
+    "tag": "cs.GL",
+    "name": "General Literature"
+  },
+  {
+    "tag": "cs.GR",
+    "name": "Graphics"
+  },
+  {
+    "tag": "cs.AR",
+    "name": "Hardware Architecture"
+  },
+  {
+    "tag": "cs.HC",
+    "name": "Human-Computer Interaction"
+  },
+  {
+    "tag": "cs.IR",
+    "name": "Information Retrieval"
+  },
+  {
+    "tag": "cs.IT",
+    "name": "Information Theory"
+  },
+  {
+    "tag": "cs.LO",
+    "name": "Logic in Computer Science"
+  },
+  {
+    "tag": "cs.LG",
+    "name": "Machine Learning"
+  },
+  {
+    "tag": "cs.MS",
+    "name": "Mathematical Software"
+  },
+  {
+    "tag": "cs.MA",
+    "name": "Multiagent Systems"
+  },
+  {
+    "tag": "cs.MM",
+    "name": "Multimedia"
+  },
+  {
+    "tag": "cs.NI",
+    "name": "Networking and Internet Architecture"
+  },
+  {
+    "tag": "cs.NE",
+    "name": "Neural and Evolutionary Computing"
+  },
+  {
+    "tag": "cs.NA",
+    "name": "Numerical Analysis"
+  },
+  {
+    "tag": "cs.OS",
+    "name": "Operating Systems"
+  },
+  {
+    "tag": "cs.OH",
+    "name": "Other Computer Science"
+  },
+  {
+    "tag": "cs.PF",
+    "name": "Performance"
+  },
+  {
+    "tag": "cs.PL",
+    "name": "Programming Languages"
+  },
+  {
+    "tag": "cs.RO",
+    "name": "Robotics"
+  },
+  {
+    "tag": "cs.SI",
+    "name": "Social and Information Networks"
+  },
+  {
+    "tag": "cs.SE",
+    "name": "Software Engineering"
+  },
+  {
+    "tag": "cs.SD",
+    "name": "Sound"
+  },
+  {
+    "tag": "cs.SC",
+    "name": "Symbolic Computation"
+  },
+  {
+    "tag": "cs.SY",
+    "name": "Systems and Control"
+  }
+]

src/fetcher/arxiv_fetcher.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import arxiv
+import urllib.request
+from pathlib  import Path
+from dateutil import parser
+from typing   import List, Dict, Any, Optional
+from utils.setup_logger import setup_logger
+from src.config import TEMP_DIR
+# Configure logging
+logger = setup_logger(__name__)
+class ArxivFetcher:
+    def __init__(self):
+        self.client = arxiv.Client()
+    def fetch_papers(self,
+                     subject_tags: List[str] = None,
+                     start_date  : str       = None,
+                     end_date    : str       = None,
+                     max_results : int       = 10,
+                     query       : str       = None) -> List[Dict[str, Any]]:
+        """
+        Fetches papers from arXiv based on subject tags and date range.
+        Args:
+            subject_tags (list): List of subject tags to filter papers by
+            start_date    (str): Start date in YYYY-MM-DD format
+            end_date      (str): End date in YYYY-MM-DD format
+            query         (str): Search query for text-based search
+            max_results   (int): Maximum number of results to return
+        Returns:
+            list: List of paper dictionaries with metadata
+        """
+        # Search query
+        if not subject_tags: filter_query = 'cat:cs.*'                                          # Default to all CS tags
+        else               : filter_query = ' OR '.join([f"cat:{tag}" for tag in subject_tags]) # Query with selected tags
+        if not query: search_query = ''
+        else        : search_query = ' AND (' + ' AND '.join([f"(ti:{q} OR abs:{q})" for q in query.split()]) + ')' # Search by title or abstract
+        final_query = f"({filter_query}){search_query}"
+        logger.info(f"Fetching papers with query: {final_query}")
+        # Search object
+        search = arxiv.Search(
+            query       = final_query,
+            max_results = max_results,
+            sort_by     = arxiv.SortCriterion.SubmittedDate
+        )
+        try:
+            results = list(self.client.results(search))
+            # Filter by date
+            if start_date or end_date:
+                filtered_results = []
+                start_date_obj   = parser.parse(start_date).date() if start_date else None
+                end_date_obj     = parser.parse(end_date).date()   if end_date   else None
+                for paper in results:
+                    paper_date = paper.published.date()
+                    if start_date_obj and paper_date < start_date_obj: continue
+                    if end_date_obj   and paper_date > end_date_obj  : continue
+                    filtered_results.append(paper)
+                results = filtered_results
+            # Convert to dictionary format with required metadata
+            papers = []
+            for paper in results:
+                papers.append({
+                    'title'           : paper.title,
+                    'authors'         : [author.name for author in paper.authors],
+                    'published'       : paper.published.strftime('%Y-%m-%d'),
+                    'updated'         : paper.updated.strftime('%Y-%m-%d') if paper.updated else None,
+                    'arxiv_id'        : paper.get_short_id(),
+                    'pdf_url'         : paper.pdf_url,
+                    'entry_id'        : paper.entry_id,
+                    'abstract'        : paper.summary,
+                    'categories'      : paper.categories,
+                    'primary_category': paper.primary_category
+                })
+            return papers
+        except Exception as e:
+            print(f"Error fetching papers: {e}")
+            return []
+    def download_paper(self, paper_id: str) -> Optional[Path]:
+        """
+        Downloads a paper's PDF from arXiv.
+        Args:
+            paper_id (str): The arXiv ID of the paper
+        Returns:
+            Optional[Path]: Path to the downloaded PDF file, or None if download failed
+        """
+        try:
+            # Create the filename
+            filename = f"{paper_id.replace('/', '_')}.pdf"
+            filepath = TEMP_DIR / filename
+            if filepath.exists():
+                return filepath
+            # Download the PDF
+            pdf_url = f"https://arxiv.org/pdf/{paper_id}"
+            urllib.request.urlretrieve(pdf_url, filepath)
+            return filepath
+        except Exception as e:
+            print(f"Error downloading paper {paper_id}: {e}")
+            return None

src/processors/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+Content processors module for text, tables, and images.
+"""

src/processors/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (246 Bytes). View file

src/processors/__pycache__/image_processor.cpython-310.pyc ADDED Viewed

Binary file (2.21 kB). View file

src/processors/__pycache__/prompts.cpython-310.pyc ADDED Viewed

Binary file (5.96 kB). View file

src/processors/__pycache__/table_processor.cpython-310.pyc ADDED Viewed

Binary file (2.22 kB). View file

src/processors/__pycache__/text_processor.cpython-310.pyc ADDED Viewed

Binary file (2.18 kB). View file

src/processors/image_processor.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""
+Image content processor for summarization.
+"""
+from typing import List, Any, Callable
+from langchain_google_genai        import ChatGoogleGenerativeAI
+from langchain_core.prompts        import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from src.config             import MODEL_NAME
+from src.processors.prompts import IMAGE_SUMMARY_PROMPT
+class ImageProcessor:
+    """Image content processor for summarization."""
+    def __init__(self, model_name: str = MODEL_NAME):
+        """
+        Initialize the image processor.
+        Args:
+            model_name (str): Name of the LLM model to use
+        """
+        self.llm   = ChatGoogleGenerativeAI(model=model_name)
+        self.chain = self._create_summary_chain()
+    def _create_summary_chain(self) -> Callable:
+        """
+        Create the image summarization chain.
+        Returns:
+            Callable: The image summarization chain
+        """
+        messages = [(
+            'user',
+            [
+                {'type': 'text'     , 'text': IMAGE_SUMMARY_PROMPT},
+                {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,{image}'}}
+            ]
+        )]
+        return (
+            ChatPromptTemplate.from_messages(messages)
+            | self.llm
+            | StrOutputParser()
+        )
+    def process(self, images: List[Any]) -> List[str]:
+        """
+        Process and summarize image elements.
+        Args:
+            images (List[Any]): List of image elements to summarize
+        Returns:
+            List[str]: List of image summaries
+        """
+        summaries = []
+        for image in images:
+            summary = self.chain.invoke({'image': image.metadata.image_base64})
+            summaries.append(summary)
+        return summaries

src/processors/prompts.py ADDED Viewed

	@@ -0,0 +1,101 @@

+"""
+Prompt templates for the LLM processors.
+"""
+# Text summarization prompt.
+# Contains a single `{text}` placeholder for the passage to summarize; the
+# sections around it (role, instructions, strict rules, task) tell the model
+# to output only a description and summary, with no conversational framing.
+# NOTE(review): the code that fills `{text}` is not visible here — confirm the
+# placeholder name against the caller before renaming it.
+TEXT_SUMMARY_PROMPT = """
+# ROLE
+You are a highly specialized text processing engine. Your only function is to describe and summarize text.
+# INSTRUCTIONS
+You will be given a piece of text as input. Your task is to:
+1.  Identify the main subject and purpose of the text (description).
+2.  Extract and synthesize the most important points and key ideas (summary).
+3.  Combine these into a single, cohesive response.
+# STRICT RULES
+- **DO NOT** use any conversational language, introductions, or concluding remarks. (e.g., "Here is a summary:", "The text discusses...", "In summary...", "The text describes...").
+- **DO NOT** refer to yourself, your instructions, or your role as an AI.
+- **DO NOT** add any information, examples, or opinions not found directly in the input text.
+- **DO NOT** apologize, express uncertainty, or ask for clarification.
+- Your output **MUST** be limited strictly to the description and summary of the text. There should be no other text, characters, or formatting.
+# TASK
+Analyze the following input text and generate ONLY the description and summary of its key ideas. Do not include any additional text, explanations, or formatting.
+--- INPUT TEXT ---
+{text}
+--- OUTPUT TEXT ---
+"""
+# Table summarization prompt.
+# Contains a single `{table}` placeholder for the table, which the prompt
+# text describes as HTML; the model is told to summarize insights and trends
+# rather than list rows, and to output nothing but that summary.
+# NOTE(review): the code that fills `{table}` is not visible here — confirm
+# the placeholder name against the caller before renaming it.
+TABLE_SUMMARY_PROMPT = """
+# ROLE
+You are a highly specialized data processing engine. Your only function is to interpret and summarize data from tables.
+# INSTRUCTIONS
+You will be given a table in HTML format as input. Your task is to:
+1.  Interpret the HTML structure (e.g., `<table>`, `<th>`, `<tr>`, `<td>`) to understand the data's organization, columns, and rows.
+2.  Identify the main subject of the table. What data is it presenting? (This is the description).
+3.  Summarize the key insights, trends, or significant relationships presented in the data. (This is the summary). **DO NOT** simply list the data row by row.
+4.  Combine these into a single, cohesive text response.
+# STRICT RULES
+- **DO NOT** use any conversational language, introductions, or concluding remarks. (e.g., "Here is a summary:", "The table discusses...", "In summary...", "The table describes...").
+- **DO NOT** refer to yourself, your instructions, or your role as an AI.
+- **DO NOT** add any information or interpretations not directly supported by the data in the table.
+- **DO NOT** apologize, express uncertainty, or ask for clarification.
+- Your output **MUST** be limited strictly to the description and summary of the table's data. There should be no other text, characters, or formatting.
+# TASK
+Analyze the following HTML table and generate ONLY the description and summary of its key ideas and data. Do not include any additional text, explanations, or formatting.
+--- INPUT TABLE (HTML) ---
+{table}
+--- OUTPUT TEXT ---
+"""
+# Instruction text for summarizing a single image.
+# Unlike the text/table prompts, this string contains no template placeholder;
+# presumably the image itself is attached to the request separately by the image
+# processor (not visible in this part of the file) -- TODO confirm.
+IMAGE_SUMMARY_PROMPT = """
+# ROLE
+You are a highly specialized Visual Analysis Engine. Your sole function is to analyze the provided image and extract all relevant information into a concise text description.
+# INSTRUCTIONS
+You will be given an image as input. Your task is to:
+1.  First, identify the type of image (e.g., photograph, bar chart, line graph, diagram, flowchart).
+2.  Based on the image type, extract all key visual and textual information:
+    - **For charts or graphs:** Identify the title, axis labels (X and Y), units, and legend. Summarize the data trends, key values (highs, lows, significant points), and the primary relationship the data illustrates.
+    - **For diagrams or flowcharts:** Identify all components, labels, and connectors (like arrows). Describe the process, hierarchy, or system being shown from start to finish.
+    - **For photographs or scenes:** Describe the main subject(s), objects, setting/environment, any visible text, and the key actions taking place.
+3.  Synthesize all extracted information into a single, comprehensive summary. The goal is to create a text-based representation of the image that captures all its important content.
+# STRICT RULES
+- **DO NOT** use any conversational language, introductions, or concluding remarks (e.g., "The image shows...", "In this picture...").
+- **DO NOT** refer to yourself, your instructions, or your role as an AI.
+- **DO NOT** speculate or infer information beyond what is visually present. Do not guess emotions, intentions, or events happening outside the frame unless explicitly supported by visual cues.
+- **DO NOT** express personal opinions or make subjective aesthetic judgments about the image.
+- **DO NOT** apologize, express uncertainty, or ask for clarification.
+- Your output **MUST** be a single block of text containing only the comprehensive description and summary. There should be no other text, characters, or formatting.
+# TASK
+Analyze the following image and generate ONLY a comprehensive text description of its content and key data. Do not include any additional text, explanations, or formatting.
+"""
+# System message for the conversational RAG chain.
+# It contains no template placeholders. The text ends with "Below is the context
+# provided to you:", so the retrieved context is expected to follow it as
+# subsequent messages. It also fixes the exact English / Vietnamese refusal
+# phrases the model must use when the context does not contain the answer.
+RAG_SYSTEM_MESSAGE = """
+You are a helpful AI assistant with conversational memory. Your task is to answer the user's question based on the provided context and your memory of the conversation history.
+- If the information needed to answer the question is not in the context, you MUST respond with the exact phrase: `Sorry 🥹, I don't have enough information to answer this question.` or 'Xin lỗi 🥹, tôi không đủ thông tin để trả lời câu hỏi này.', based on the language of the question.
+- You should use the conversation history to provide more coherent and contextually relevant responses.
+- When referencing previous exchanges, do so naturally within your response.
+- Your entire answer must be grounded in the provided text and conversation history.
+- Format your response in Markdown.
+Below is the context provided to you:
+"""

src/processors/table_processor.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""
+Table content processor for summarization.
+"""
+from typing import List, Any, Callable
+from langchain_google_genai        import ChatGoogleGenerativeAI
+from langchain_core.prompts        import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from src.config             import MODEL_NAME
+from src.processors.prompts import TABLE_SUMMARY_PROMPT
+class TableProcessor:
+    """Table content processor for summarization."""
+    def __init__(self, model_name: str = MODEL_NAME):
+        """
+        Initialize the table processor.
+        Args:
+            model_name (str): Name of the LLM model to use
+        """
+        self.llm   = ChatGoogleGenerativeAI(model=model_name)
+        self.chain = self._create_summary_chain()
+    def _create_summary_chain(self) -> Callable:
+        """
+        Create the table summarization chain.
+        Returns:
+            Callable: The table summarization chain
+        """
+        return (
+            {'table': lambda x: x}
+            | ChatPromptTemplate.from_template(TABLE_SUMMARY_PROMPT)
+            | self.llm
+            | StrOutputParser()
+        )
+    def process(self, tables: List[Any]) -> List[str]:
+        """
+        Process and summarize table elements.
+        Args:
+            tables (List[Any]): List of table elements to summarize
+        Returns:
+            List[str]: List of table summaries
+        """
+        summaries = []
+        for table in tables:
+            summary = self.chain.invoke(table.metadata.text_as_html)
+            summaries.append(summary)
+        return summaries

src/processors/text_processor.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""
+Text content processor for summarization.
+"""
+from typing import List, Any, Callable
+from langchain_google_genai        import ChatGoogleGenerativeAI
+from langchain_core.prompts        import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from src.config             import MODEL_NAME
+from src.processors.prompts import TEXT_SUMMARY_PROMPT
+class TextProcessor:
+    """Text content processor for summarization."""
+    def __init__(self, model_name: str = MODEL_NAME):
+        """
+        Initialize the text processor.
+        Args:
+            model_name (str): Name of the LLM model to use
+        """
+        self.llm   = ChatGoogleGenerativeAI(model=model_name)
+        self.chain = self._create_summary_chain()
+    def _create_summary_chain(self) -> Callable:
+        """
+        Create the text summarization chain.
+        Returns:
+            Callable: The text summarization chain
+        """
+        return (
+            {'text': lambda x: x}
+            | ChatPromptTemplate.from_template(TEXT_SUMMARY_PROMPT)
+            | self.llm
+            | StrOutputParser()
+        )
+    def process(self, texts: List[Any]) -> List[str]:
+        """
+        Process and summarize text elements.
+        Args:
+            texts (List[Any]): List of text elements to summarize
+        Returns:
+            List[str]: List of text summaries
+        """
+        summaries = []
+        for text in texts:
+            summary = self.chain.invoke(text.text)
+            summaries.append(summary)
+        return summaries

src/rag/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+RAG pipeline implementation module.
+"""

src/rag/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (219 Bytes). View file

src/rag/__pycache__/pipeline.cpython-310.pyc ADDED Viewed

Binary file (4.93 kB). View file

src/rag/pipeline.py ADDED Viewed

	@@ -0,0 +1,149 @@

+"""
+RAG pipeline implementation.
+"""
+from typing   import Dict, List, Any, Callable
+from operator import itemgetter
+from langchain_google_genai            import ChatGoogleGenerativeAI
+from langchain_core.prompts            import ChatPromptTemplate
+from langchain_core.messages           import SystemMessage, HumanMessage
+from langchain_core.runnables          import RunnablePassthrough, RunnableLambda
+from langchain_core.output_parsers     import StrOutputParser
+from langchain.retrievers.multi_vector import MultiVectorRetriever
+from langchain.memory.summary          import ConversationSummaryMemory
+from src.config             import MODEL_NAME
+from src.processors.prompts import RAG_SYSTEM_MESSAGE
+class RAGPipeline:
+    """RAG pipeline implementation."""
+    def __init__(self, retriever: MultiVectorRetriever, model_name: str = MODEL_NAME):
+        """
+        Initialize the RAG pipeline.
+        Args:
+            retriever (MultiVectorRetriever): The document retriever
+            model_name                 (str): Name of the LLM model to use
+        """
+        self.retriever = retriever
+        self.llm       = ChatGoogleGenerativeAI(model=model_name)
+        self.rag_chain = self._create_rag_chain()
+        self.memory    = ConversationSummaryMemory(
+            llm=self.llm,
+            memory_key="chat_history",
+            return_messages=True,
+            input_key="question",
+            output_key="response"
+        )
+    def parse_docs(self, docs: List[Any]) -> Dict[str, List[Any]]:
+        """
+        Parse the retrieved documents into text, image, and table lists.
+        Args:
+            docs (List[Any]): List of retrieved documents
+        Returns:
+            Dict[str, List[Any]]: Dictionary with keys 'texts', 'images', 'tables'
+        """
+        parsed_texts, parsed_images, parsed_tables = [], [], []
+        for doc in docs:
+            if   type(doc).__name__ == 'Table'           : parsed_tables.append(doc.metadata.text_as_html)
+            elif type(doc).__name__ == 'Image'           : parsed_images.append(doc.metadata.image_base64)
+            elif type(doc).__name__ == 'CompositeElement': parsed_texts.append(doc.text)
+        return {'texts': parsed_texts, 'images': parsed_images, 'tables': parsed_tables}
+    def _build_prompt(self, kwargs: Dict[str, Any]) -> ChatPromptTemplate:
+        """
+        Build the prompt template for the RAG query.
+        Args:
+            kwargs (Dict[str, Any]): Dictionary with keys 'context', 'question', and 'chat_history'
+        Returns:
+            ChatPromptTemplate: The chat prompt template
+        """
+        context      = kwargs['context']
+        question     = kwargs['question']
+        chat_history = kwargs.get('chat_history', [])
+        messages = [SystemMessage(content=RAG_SYSTEM_MESSAGE)]
+        # Add conversation history if available
+        if chat_history:
+            messages.extend(chat_history)
+        for txt in context['texts'] : messages.append(HumanMessage(content=[{'type': 'text', 'text': f"[TEXT]:\n{txt}"}]))
+        for tbl in context['tables']: messages.append(HumanMessage(content=[{'type': 'text', 'text': f"[TABLE]:\n```html\n{tbl}\n```"}]))
+        for img in context['images']:
+            messages.append(
+                HumanMessage(content=[{'type': 'text'     , 'text': f"[IMAGE]:\n"},
+                                      {'type': 'image_url', 'image_url': {'url': f"data:image/jpeg;base64,{img}"}}])
+            )
+        messages.append(
+            HumanMessage(content=[{'type': 'text',
+                                   'text': f"Based on the above contexts and our conversation history, answer the question: {question}"}])
+        )
+        return ChatPromptTemplate.from_messages(messages)
+    def _create_rag_chain(self) -> Callable:
+        """
+        Create the RAG chain.
+        Returns:
+            Callable: The RAG chain
+        """
+        return (
+            {
+                'context'     : itemgetter('question') | RunnableLambda(lambda q: f"query: {q}") | self.retriever | RunnableLambda(self.parse_docs),
+                'question'    : itemgetter('question'),
+                'chat_history': itemgetter('chat_history')
+            }
+            | RunnablePassthrough().assign(
+                response=(
+                    RunnableLambda(self._build_prompt)
+                    | self.llm
+                    | StrOutputParser()
+                )
+            )
+        )
+    def query(self, question: str) -> Dict[str, Any]:
+        """
+        Query the RAG pipeline.
+        Args:
+            question (str): The question to answer
+        Returns:
+            Dict[str, Any]: Dictionary with keys 'question', 'context', and 'response'
+        """
+        # Get chat history from memory
+        chat_history = self.memory.load_memory_variables({})
+        # Execute the query with the chat history
+        result = self.rag_chain.invoke({
+            'question': question,
+            'chat_history': chat_history.get('chat_history', [])
+        })
+        # Update memory with the new interaction
+        self.memory.save_context(
+            {"question": question},
+            {"response": result['response']}
+        )
+        return result
+    def reset_memory(self):
+        """Reset the conversation memory."""
+        self.memory.clear()

src/storage/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+Vector storage and retrieval module.
+"""

src/storage/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (224 Bytes). View file

src/storage/__pycache__/vectorstore.cpython-310.pyc ADDED Viewed

Binary file (5.36 kB). View file

src/storage/vectorstore.py ADDED Viewed

	@@ -0,0 +1,151 @@

+"""
+Vector storage and retrieval implementation.
+"""
+import uuid
+from typing import List, Any
+from langchain_chroma                  import Chroma
+from langchain.storage                 import InMemoryStore
+from langchain.schema.document         import Document
+from langchain_huggingface             import HuggingFaceEmbeddings
+from langchain.retrievers.multi_vector import MultiVectorRetriever
+from src.config import EMBEDDING_MODEL, DEVICE, COLLECTION_NAME
+class VectorStore:
+    """Vector storage and retrieval implementation."""
+    def __init__(self, collection_name: str = COLLECTION_NAME, embedding_model: str = EMBEDDING_MODEL):
+        """
+        Initialize the vector store.
+        Args:
+            collection_name (str): Name of the vector store collection
+            embedding_model (str): Name of the embedding model to use
+        """
+        self.embedding_function = self._create_embedding_function(embedding_model)
+        self.vector_store       = self._create_vector_store(collection_name)
+        self.doc_store          = InMemoryStore()
+        self.id_key             = 'doc_id'
+        self.retriever          = self._create_retriever()
+    def _create_embedding_function(self, model_name: str) -> HuggingFaceEmbeddings:
+        """
+        Create an embedding function.
+        Args:
+            model_name (str): Name of the embedding model
+        Returns:
+            HuggingFaceEmbeddings: The embedding function
+        """
+        return HuggingFaceEmbeddings(
+            model_name    = model_name,
+            model_kwargs  = {'device': DEVICE},
+            encode_kwargs = {'normalize_embeddings': True} # Change this if use an already normalized model
+        )
+    def _create_vector_store(self, collection_name: str) -> Chroma:
+        """
+        Create a vector store.
+        Args:
+            collection_name (str): Name of the vector store collection
+        Returns:
+            Chroma: The vector store
+        """
+        return Chroma(
+            collection_name    = collection_name,
+            embedding_function = self.embedding_function,
+        )
+    def _create_retriever(self) -> MultiVectorRetriever:
+        """
+        Create a multi-vector retriever.
+        Returns:
+            MultiVectorRetriever: The retriever
+        """
+        return MultiVectorRetriever(
+            vectorstore = self.vector_store,
+            docstore    = self.doc_store,
+            id_key      = self.id_key,
+        )
+    def add_to_retriever(self, data: List[Any], data_summaries: List[str]) -> None:
+        """
+        Add data and summaries to the retriever.
+        Args:
+            data           (List[Any]): List of data elements
+            data_summaries (List[str]): List of data summaries
+        """
+        if not data:
+            return
+        if len(data) != len(data_summaries):
+            raise ValueError(f"Length mismatch: {len(data)} data but {len(data_summaries)} summaries")
+        ids = [str(uuid.uuid4()) for _ in range(len(data))]
+        summaries = [
+            Document(
+                page_content = f"passage: {summary}", # Change this to suit with model requirements if use a different model
+                metadata     = {self.id_key: i}
+            )
+            for i, summary in zip(ids, data_summaries)
+        ]
+        self.retriever.vectorstore.add_documents(summaries)
+        self.retriever.docstore.mset(list(zip(ids, data)))
+    def add_contents(self,
+                     texts : List[Any], text_summaries : List[str],
+                     tables: List[Any], table_summaries: List[str],
+                     images: List[Any], image_summaries: List[str]) -> None:
+        """
+        Add all content types and their summaries to the retriever.
+        Args:
+            texts           (List[Any]): List of text elements
+            text_summaries  (List[str]): List of text summaries
+            tables          (List[Any]): List of table elements
+            table_summaries (List[str]): List of table summaries
+            images          (List[Any]): List of image elements
+            image_summaries (List[str]): List of image summaries
+        """
+        self.add_to_retriever(texts , text_summaries)
+        self.add_to_retriever(tables, table_summaries)
+        self.add_to_retriever(images, image_summaries)
+    def reset(self) -> None:
+        """Reset the vector store and document store."""
+        try:
+            self.vector_store.reset_collection()
+        except Exception as e:
+            raise RuntimeError(f"Failed to reset vector store: {e}")
+        # self.vector_store = self._create_vector_store(COLLECTION_NAME)
+        self.doc_store    = InMemoryStore()
+        self.retriever    = self._create_retriever()
+    def retrieve(self, query: str) -> List[Any]:
+        """
+        Retrieve relevant documents for a query.
+        Args:
+            query (str): The query string
+        Returns:
+            List[Any]: List of retrieved documents
+        """
+        return self.retriever.invoke(query)

static/css/modern-styles.css ADDED Viewed

	@@ -0,0 +1,1177 @@

+/* Modern Styles for arXivCSRAG Application - 2025 Edition */
+/* Color Scheme & Variables */
+:root {
+  /* Core colors */
+  --primary: #4361ee;
+  --primary-light: #4cc9f0;
+  --primary-dark: #3a0ca3;
+  --secondary: #4b4d63;
+  --accent: #f72585;
+  /* Neutral colors */
+  --background: #f8f9fd;
+  --surface: #ffffff;
+  --surface-variant: #f0f2f9;
+  /* Text colors */
+  --text-primary: #1e1e2f;
+  --text-secondary: #4b4d63;
+  --text-tertiary: #6e7191;
+  --text-on-primary: #ffffff;
+  --text-on-accent: #ffffff;
+  /* Status colors */
+  --success: #06d6a0;
+  --warning: #ffd166;
+  --error: #ef476f;
+  --info: #118ab2;
+  /* Border colors */
+  --border: #e2e8f0;
+  --border-hover: #cbd5e1;
+  /* Shadows */
+  --shadow-sm: 0 1px 2px rgba(30, 30, 47, 0.05);
+  --shadow-md: 0 4px 8px rgba(30, 30, 47, 0.08);
+  --shadow-lg: 0 8px 16px rgba(30, 30, 47, 0.1);
+  /* Spacing */
+  --space-1: 0.25rem;
+  --space-2: 0.5rem;
+  --space-3: 0.75rem;
+  --space-4: 1rem;
+  --space-5: 1.5rem;
+  --space-6: 2rem;
+  --space-8: 3rem;
+  /* Dimensions */
+  --header-height: 70px;
+  --border-radius-sm: 6px;
+  --border-radius-md: 8px;
+  --border-radius-lg: 12px;
+  --border-radius-xl: 16px;
+  /* Typography */
+  --font-sans: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
+  --font-mono: 'JetBrains Mono', SFMono-Regular, Menlo, Monaco, Consolas, monospace;
+  /* Transitions */
+  --transition-fast: 150ms cubic-bezier(0.4, 0, 0.2, 1);
+  --transition-normal: 250ms cubic-bezier(0.4, 0, 0.2, 1);
+  /* Color variable for RGB usage */
+  --primary-rgb: 67, 97, 238;
+}
+/* Base Reset */
+*, *::before, *::after {
+  box-sizing: border-box;
+  margin: 0;
+  padding: 0;
+}
+html {
+  font-size: 16px;
+  scroll-behavior: smooth;
+}
+body {
+  font-family: var(--font-sans);
+  line-height: 1.5;
+  color: var(--text-primary);
+  background-color: var(--background);
+  -webkit-font-smoothing: antialiased;
+  -moz-osx-font-smoothing: grayscale;
+  overflow-x: hidden;
+  margin: 0;
+  padding: 0;
+  min-height: 100vh;
+}
+/* Typography */
+h1, h2, h3, h4, h5, h6 {
+  font-weight: 600;
+  line-height: 1.2;
+  color: var(--text-primary);
+  margin-bottom: var(--space-4);
+}
+h1 {
+  font-size: 2.25rem;
+  letter-spacing: -0.025em;
+}
+h2 {
+  font-size: 1.75rem;
+  letter-spacing: -0.0125em;
+}
+h3 {
+  font-size: 1.5rem;
+}
+h4 {
+  font-size: 1.25rem;
+}
+p {
+  margin-bottom: var(--space-4);
+}
+a {
+  color: var(--primary);
+  text-decoration: none;
+  transition: color var(--transition-fast);
+}
+a:hover {
+  color: var(--primary-dark);
+}
+/* Layout */
+.app-container {
+  max-width: 1600px;
+  margin: 0 auto;
+  padding: 0 var(--space-4);
+  min-height: 100vh;
+  display: flex;
+  flex-direction: column;
+}
+/* Header */
+header {
+  height: var(--header-height);
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  /* padding: 0 var(--space-5); */
+  border-bottom: 1px solid var(--border);
+  background-color: var(--surface);
+}
+.header-content {
+  display: flex;
+  align-items: center;
+  gap: var(--space-3);
+}
+.header-content h1 {
+  color: var(--primary);
+  margin-bottom: 0;
+  font-weight: 700;
+}
+/* NOTE(review): 3rem here is larger than the h1 size (2.25rem) set earlier in
+   this stylesheet, and the header is only var(--header-height) tall.
+   Confirm this size is intentional. */
+.header-content p {
+  color: var(--text-tertiary);
+  margin-bottom: 0;
+  font-size: 3rem;
+  position: relative;
+  top: 2px;
+}
+/* Main Content */
+main {
+  padding: var(--space-5) 0;
+  flex: 1;
+}
+.content-row {
+  display: flex;
+  gap: var(--space-5);
+  height: calc(100vh - var(--header-height) - var(--space-5) * 2 - 60px); /* 60px for footer */
+}
+.panel {
+  background-color: var(--surface);
+  border-radius: var(--border-radius-lg);
+  box-shadow: var(--shadow-md);
+  overflow: hidden;
+  display: flex;
+  flex-direction: column;
+  height: 100%;
+}
+.left-panel {
+  flex: 1;
+  min-width: 0; /* Prevents flex items from overflowing */
+}
+.right-panel {
+  flex: 1;
+  min-width: 0;
+}
+/* Nav Tabs */
+.nav-tabs {
+  display: flex;
+  border-bottom: 1px solid var(--border);
+  background-color: var(--surface-variant);
+  min-height: 76px; /* Increase height while maintaining flexibility */
+  align-items: center; /* Vertically center the nav links */
+}
+/* Individual tab button inside .nav-tabs */
+.nav-link {
+  /* padding: var(--space-4) var(--space-5); */
+  padding-top: 0.8rem;
+  padding-bottom: 0.8rem;
+  padding-left: 0.6rem;
+  padding-right: 0.6rem;
+  font-size: 1rem;
+  font-weight: 500;
+  color: var(--text-tertiary);
+  border: none;
+  background: transparent;
+  cursor: pointer;
+  position: relative; /* Positioning context for the .nav-link.active::after underline */
+  transition: color var(--transition-fast);
+  height: 100%; /* Make links fill the height of nav-tabs */
+  /* display: flex; */
+  /* NOTE(review): with display: flex commented out above, align-items likely has
+     no effect on this element -- confirm and remove or re-enable. */
+  align-items: center;
+}
+.nav-link:hover {
+  color: var(--primary);
+}
+.nav-link.active {
+  color: var(--primary);
+  font-weight: 600;
+}
+.nav-link.active::after {
+  content: '';
+  position: absolute;
+  bottom: 0;
+  left: 0;
+  width: 100%;
+  height: 2px;
+  background-color: var(--primary);
+}
+/* Tab Content */
+.tab-content {
+  flex: 1;
+  overflow: auto;
+  padding: var(--space-5);
+}
+.tab-pane {
+  display: none;
+}
+.tab-pane.active {
+  display: block;
+}
+/* Search Form */
+.search-section {
+  margin-bottom: var(--space-5);
+}
+.search-section h2 {
+  font-size: 1.25rem;
+  margin-bottom: var(--space-4);
+  color: var(--text-primary);
+}
+.filters-grid {
+  display: grid;
+  grid-template-columns: 1.4fr 0.6fr;
+  /* gap: var(--space-4); */
+  /* row-gap: 0.5rem; */
+  column-gap: 1rem;
+  /* margin-bottom: var(--space-5); */
+  margin-bottom: 0.3rem;
+}
+.filter-group {
+  margin-bottom: var(--space-4);
+}
+.filter-group label {
+  display: block;
+  margin-bottom: var(--space-2);
+  font-weight: 500;
+  font-size: 0.875rem;
+  color: var(--text-secondary);
+}
+.form-control {
+  width: 100%;
+  padding: var(--space-3);
+  border: 1px solid var(--border);
+  border-radius: var(--border-radius-sm);
+  font-family: var(--font-sans);
+  font-size: 0.9375rem;
+  background-color: var(--surface);
+  color: var(--text-primary);
+  transition: border-color var(--transition-fast), box-shadow var(--transition-fast);
+}
+.form-control:focus {
+  outline: none;
+  border-color: var(--primary-light);
+  box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.15);
+}
+.form-select {
+  appearance: none;
+  background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16' viewBox='0 0 24 24' fill='none' stroke='%234b4d63' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'%3E%3C/polyline%3E%3C/svg%3E");
+  background-repeat: no-repeat;
+  background-position: right 0.75rem center;
+  background-size: 1rem;
+  padding-right: 2.5rem;
+}
+.date-range-inputs {
+  display: flex;
+  gap: var(--space-3);
+  flex-direction: column;
+}
+.btn {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  padding: var(--space-3) var(--space-4);
+  font-weight: 500;
+  font-size: 0.9375rem;
+  border-radius: var(--border-radius-sm);
+  border: none;
+  cursor: pointer;
+  transition: background-color var(--transition-fast), transform var(--transition-fast);
+}
+.btn:active {
+  transform: translateY(1px);
+}
+.btn-primary {
+  background-color: var(--primary);
+  color: var(--text-on-primary);
+}
+.btn-primary:hover {
+  background-color: var(--primary-dark);
+}
+.btn-secondary {
+  background-color: var(--surface-variant);
+  color: var(--text-secondary);
+}
+.btn-secondary:hover {
+  background-color: var(--border);
+}
+.btn-outline {
+  background-color: transparent;
+  border: 1px solid var(--border);
+  color: var(--text-secondary);
+}
+.btn-outline:hover {
+  background-color: var(--surface-variant);
+}
+.btn i {
+  margin-right: var(--space-2);
+}
+.search-btn {
+  width: 100%;
+  /* margin-top: var(--space-4); */
+}
+.upload-section {
+  border-top: 1px solid var(--border);
+  padding-top: var(--space-4);
+  margin-top: 1rem;
+}
+.upload-section p {
+  margin-bottom: var(--space-3);
+  font-size: 0.9375rem;
+  font-weight: 500;
+}
+.upload-container {
+  display: flex;
+  flex-direction: column;
+  gap: var(--space-3);
+}
+.file-input-wrapper {
+  position: relative;
+  overflow: hidden;
+  display: inline-block;
+  width: 100%;
+}
+.file-input-button {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  background-color: var(--surface-variant);
+  color: var(--primary);
+  border: 2px dashed var(--primary-light);
+  border-radius: var(--border-radius-sm);
+  padding: 12px 16px;
+  font-weight: 500;
+  cursor: pointer;
+  transition: all 0.2s ease;
+  width: 100%;
+  justify-content: center;
+}
+.file-input-button:hover {
+  background-color: rgba(67, 97, 238, 0.05);
+  border-color: var(--primary);
+}
+.file-input-button i {
+  font-size: 1.2rem;
+}
+.file-name-display {
+  margin-top: 8px;
+  padding: 8px 12px;
+  background-color: var(--surface-variant);
+  border-radius: var(--border-radius-sm);
+  font-size: 0.875rem;
+  display: none;
+  align-items: center;
+  gap: 8px;
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  border: 1px solid var(--border);
+}
+.file-name-display.active {
+  display: flex;
+}
+.file-name-display i {
+  color: var(--success);
+  flex-shrink: 0;
+  font-size: 1rem;
+}
+.file-name-display span {
+  overflow: hidden;
+  text-overflow: ellipsis;
+  font-weight: 500;
+}
+#pdf-upload {
+  position: absolute;
+  left: 0;
+  top: 0;
+  opacity: 0;
+  width: 100%;
+  height: 100%;
+  cursor: pointer;
+}
+#upload-button {
+  width: 100%;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  gap: 8px;
+  padding: 10px 16px;
+}
+#upload-button.file-selected {
+  background-color: var(--success);
+  border-color: var(--success);
+}
+#upload-button i {
+  font-size: 1.1rem;
+}
+/* Results List */
+.results-container {
+  padding-top: var(--space-2);
+}
+.no-results {
+  text-align: center;
+  color: var(--text-tertiary);
+  padding: var(--space-8) 0;
+}
+.paper-item {
+  border: 1px solid var(--border);
+  border-radius: var(--border-radius-md);
+  padding: var(--space-4);
+  margin-bottom: var(--space-4);
+  cursor: pointer;
+  transition: border-color var(--transition-normal), box-shadow var(--transition-normal), transform var(--transition-normal);
+}
+.paper-item:hover {
+  border-color: var(--primary-light);
+  box-shadow: var(--shadow-md);
+  transform: translateY(-2px);
+}
+.paper-item-header {
+  display: flex;
+  justify-content: space-between;
+  margin-bottom: var(--space-3);
+}
+/* Paper title: kept on a single line and truncated with an ellipsis on overflow */
+.paper-title {
+  font-weight: 600;
+  color: var(--primary);
+  margin-bottom: var(--space-2);
+  font-size: 1.0625rem;
+  flex: 1;
+  margin-right: var(--space-3); /* Leave a gap between the title and the date */
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+}
+.paper-date {
+  color: var(--text-tertiary);
+  font-size: 0.875rem;
+}
+.paper-authors {
+  margin-bottom: var(--space-3);
+  font-size: 0.9375rem;
+  color: var(--text-secondary);
+}
+.paper-categories {
+  display: flex;
+  flex-wrap: wrap;
+  gap: var(--space-2);
+  margin-top: var(--space-3);
+}
+.paper-category {
+  background-color: var(--surface-variant);
+  padding: var(--space-1) var(--space-2);
+  border-radius: var(--border-radius-sm);
+  font-size: 0.75rem;
+  color: var(--text-tertiary);
+}
+/* Chat Interface */
+.chat-panel {
+  display: flex;
+  flex-direction: column;
+  height: 100%;
+}
+.chat-header {
+  padding: var(--space-4) var(--space-5);
+  border-bottom: 1px solid var(--border);
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  background-color: var(--surface-variant);
+}
+.chat-header h2 {
+  margin-bottom: 0;
+  font-size: 1.25rem;
+}
+.chat-messages {
+  flex: 1;
+  overflow-y: auto;
+  padding: var(--space-5);
+  display: flex;
+  flex-direction: column;
+  gap: var(--space-4);
+}
+.system-message {
+  text-align: center;
+  /* padding: var(--space-8) var(--space-4); */
+  color: var(--text-tertiary);
+}
+.message {
+  max-width: 85%;
+  word-wrap: break-word;
+}
+.user-message {
+  align-self: flex-end;
+  background-color: var(--primary);
+  color: var(--text-on-primary);
+  border-radius: var(--border-radius-lg) var(--border-radius-lg) 0 var(--border-radius-lg);
+  padding: var(--space-3) var(--space-4);
+  box-shadow: var(--shadow-sm);
+}
+.bot-message {
+  align-self: flex-start;
+  background-color: var(--surface-variant);
+  border-radius: var(--border-radius-lg) var(--border-radius-lg) var(--border-radius-lg) 0;
+  padding: var(--space-3) var(--space-4);
+  box-shadow: var(--shadow-sm);
+}
+.citations {
+  margin-top: var(--space-2);
+  font-size: 0.75rem;
+  color: var(--text-tertiary);
+  border-top: 1px solid var(--border);
+  padding-top: var(--space-2);
+}
+.citations-header {
+  display: flex;
+  justify-content: flex-end;
+  align-items: center;
+  margin-bottom: var(--space-2);
+}
+.view-all-citations-btn {
+  background-color: transparent;
+  color: var(--primary);
+  border: 1px solid var(--primary);
+  border-radius: var(--border-radius-sm);
+  padding: 3px 10px;
+  font-size: 0.8rem;
+  cursor: pointer;
+  transition: all 0.2s;
+  white-space: nowrap;
+}
+.view-all-citations-btn:hover {
+  background-color: var(--primary);
+  color: var(--text-on-primary);
+}
+.chat-input-container {
+  padding: var(--space-4);
+  border-top: 1px solid var(--border);
+  display: flex;
+  align-items: center;
+  gap: var(--space-3);
+}
+/* Chat message input: grows between min-height and max-height, manual resize disabled */
+#chat-input {
+  resize: none;
+  flex: 1;
+  border-radius: var(--border-radius-lg);
+  padding: var(--space-3) var(--space-4);
+  min-height: 54px;
+  max-height: 150px;
+  /* Vertically center the content/placeholder */
+  display: flex;
+  align-items: center;
+}
+#send-message-btn {
+  width: 54px;
+  height: 54px;
+  border-radius: 50%;
+  padding: 0;
+  flex-shrink: 0;
+}
+#send-message-btn i {
+  margin-right: 0;
+  font-size: 1.25rem;
+  flex: 1;
+}
+/* Modals */
+.modal {
+  display: none;
+  position: fixed;
+  top: 0;
+  left: 0;
+  width: 100%;
+  height: 100%;
+  background-color: rgba(30, 30, 47, 0.5);
+  backdrop-filter: blur(4px);
+  z-index: 1000;
+  overflow: auto;
+  padding: var(--space-4);
+}
+.modal-content {
+  background-color: var(--surface);
+  margin: var(--space-6) auto;
+  width: 92%;
+  max-width: 1000px;
+  border-radius: var(--border-radius-lg);
+  box-shadow: var(--shadow-lg);
+  animation: modalOpen 0.3s ease forwards;
+  overflow: hidden;
+  max-height: calc(100vh - 60px);
+  display: flex;
+  flex-direction: column;
+}
+@keyframes modalOpen {
+  from {
+    opacity: 0;
+    transform: translateY(-20px) scale(0.95);
+  }
+  to {
+    opacity: 1;
+    transform: translateY(0) scale(1);
+  }
+}
+.modal-header {
+  padding: var(--space-4) var(--space-5);
+  border-bottom: 1px solid var(--border);
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+}
+.modal-header h3 {
+  margin-bottom: 0;
+  font-size: 1.25rem;
+}
+.close-modal {
+  background: none;
+  border: none;
+  font-size: 1.5rem;
+  cursor: pointer;
+  color: var(--text-tertiary);
+  transition: color var(--transition-fast);
+  line-height: 1;
+}
+.close-modal:hover {
+  color: var(--error);
+}
+.modal-body {
+  padding: var(--space-5);
+  overflow-y: auto;
+  flex: 1;
+}
+.modal-footer {
+  padding: var(--space-4) var(--space-5);
+  border-top: 1px solid var(--border);
+  display: flex;
+  justify-content: flex-end;
+  gap: var(--space-3);
+}
+.form-group {
+  margin-bottom: var(--space-4);
+}
+.form-group label {
+  display: block;
+  margin-bottom: var(--space-2);
+  font-weight: 500;
+}
+.paper-metadata p {
+  margin-bottom: var(--space-3);
+}
+.paper-abstract {
+  margin-top: var(--space-5);
+  padding-top: var(--space-4);
+  border-top: 1px solid var(--border);
+}
+.paper-abstract h4 {
+  margin-bottom: var(--space-3);
+  color: var(--text-secondary);
+}
+/* Loading Overlay */
+.loading-overlay {
+  display: none;
+  position: fixed;
+  top: 0;
+  left: 0;
+  width: 100%;
+  height: 100%;
+  background-color: rgba(30, 30, 47, 0.7);
+  backdrop-filter: blur(4px);
+  z-index: 2000;
+  justify-content: center;
+  align-items: center;
+}
+/* Wrapper that stacks the spinner above its caption(s) and centers both */
+.spinner-container {
+  text-align: center;
+  color: white;
+  max-width: 200px;
+  /* Fix alignment */
+  display: flex;
+  flex-direction: column; /* Vertical stack: spinner on top, text below */
+  align-items: center;     /* Center horizontally */
+  justify-content: center; /* Center vertically */
+}
+/* Circular indicator; rotated continuously by the `spinner` keyframes.
+   (Declared once: a second, byte-identical copy of this rule was removed.) */
+.spinner-border {
+  width: 3rem;
+  height: 3rem;
+  border: 0.25rem solid rgba(255, 255, 255, 0.3);
+  border-radius: 50%;
+  border-top-color: white;
+  animation: spinner 0.8s linear infinite;
+}
+/* Short caption shown under the spinner */
+.spinner-text {
+  margin-top: var(--space-4);
+  font-size: 1rem;
+  font-weight: 500;
+  max-width: 120px;
+  word-wrap: break-word;      /* Wrap onto a new line when needed */
+  text-align: center;         /* Keep the text centered when it wraps */
+}
+@keyframes spinner {
+  to {
+    transform: rotate(360deg);
+  }
+}
+/* Status line shown under the spinner while a request is in flight */
+#loading-message {
+  margin-top: var(--space-4);
+  font-size: 1rem;
+  font-weight: 500;
+  width: 500px;
+  /* Cap the fixed width so the message is not clipped on viewports
+     narrower than 500px; wide screens still get the full 500px. */
+  max-width: 90vw;
+  text-align: center;
+}
+/* Citations Modal Styles */
+.citation-query {
+  font-style: italic;
+  margin-bottom: var(--space-4);
+  padding: var(--space-3);
+  background-color: var(--surface-variant);
+  border-left: 3px solid var(--primary);
+  border-radius: var(--border-radius-sm);
+  font-size: 1rem;
+}
+.citations-container {
+  overflow-y: auto;
+}
+.citation-section {
+  margin-bottom: var(--space-6);
+  border-bottom: 1px solid var(--border);
+  padding-bottom: var(--space-4);
+}
+.citation-section:last-child {
+  border-bottom: none;
+}
+.citation-section h4 {
+  color: var(--secondary);
+  margin-bottom: var(--space-3);
+  font-size: 1.1rem;
+}
+.citation-list {
+  display: flex;
+  flex-direction: column;
+  gap: var(--space-4);
+}
+.text-citation, .table-citation, .image-citation {
+  display: flex;
+  gap: var(--space-3);
+  background-color: var(--surface-variant);
+  border-radius: var(--border-radius-sm);
+  padding: var(--space-3);
+  box-shadow: var(--shadow-sm);
+  align-items: flex-start;
+  overflow: hidden;
+  width: 100%;
+}
+.citation-number {
+  background-color: var(--primary);
+  color: var(--text-on-primary);
+  width: 24px;
+  height: 24px;
+  min-width: 24px;
+  border-radius: 50%;
+  display: flex;
+  justify-content: center;
+  align-items: center;
+  font-size: 0.8rem;
+  flex-shrink: 0;
+}
+.citation-text {
+  line-height: 1.5;
+  font-size: 0.9rem;
+}
+.citation-table-container {
+  width: 100%;
+  overflow-x: auto;
+  max-width: calc(100% - 40px); /* Account for the citation number */
+  border-radius: var(--border-radius-sm);
+}
+.citation-table {
+  font-size: 0.9rem;
+  overflow: hidden;
+}
+.citation-table table {
+  border-collapse: collapse;
+  width: 100%;
+  max-width: 100%;
+  table-layout: auto;
+  margin: 0;
+}
+.citation-table th, .citation-table td {
+  border: 1px solid var(--border);
+  padding: 4px 8px;
+  word-break: break-word;
+}
+.citation-table th {
+  background-color: var(--surface-variant);
+}
+.citation-image-container {
+  width: 100%;
+  max-width: calc(100% - 40px); /* Account for the citation number */
+  display: flex;
+  justify-content: center;
+  overflow: hidden;
+  margin: 0 auto;
+}
+/* Citation Image Spinner */
+.citation-image-loading {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    min-height: 100px;
+    color: var(--text-tertiary);
+}
+.responsive-image {
+  max-width: 100%;
+  max-height: 280px;
+  object-fit: contain;
+  border-radius: var(--border-radius-sm);
+  display: block;
+}
+/* Footer Styles */
+.app-footer {
+    /* padding: var(--space-4) var(--space-5); */
+    /* background-color: var(--surface-variant); */
+    border-top: 1px solid var(--border);
+    margin-top: auto;
+}
+.footer-content {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    /* max-width: 1200px; */
+    /* margin: 0 auto; */
+    /* font-size: 0.9rem; */
+    color: var(--text-secondary);
+    margin-bottom: 20px;
+    margin-top: 10px;
+}
+.footer-content p {
+    margin: 0;
+    display: flex;
+    align-items: center;
+    height: 100%;
+}
+.github-link {
+    display: flex;
+    align-items: center;
+    gap: var(--space-2);
+    color: var(--primary);
+    text-decoration: none;
+    transition: color 0.2s ease;
+}
+.github-link:hover {
+    color: var(--primary-dark);
+    text-decoration: underline;
+}
+.github-link i {
+    font-size: 1.1rem;
+}
+/* Tooltip Styles */
+.tooltip-container {
+    position: relative;
+    display: inline-flex;
+    /* align-items: center; */
+}
+.tooltip-icon {
+    margin-left: var(--space-2);
+    color: var(--text-tertiary);
+    cursor: help;
+    font-size: 1rem;
+}
+.tooltip-icon:hover {
+    color: var(--primary);
+}
+.tooltip-text {
+    visibility: hidden;
+    position: absolute;
+    z-index: 100;
+    bottom: 125%;
+    /* left: 20%; */
+    transform: translateX(5%) translateY(100%);
+    background-color: var(--text-primary);
+    color: white;
+    text-align: center;
+    border-radius: var(--border-radius-sm);
+    padding: var(--space-2) var(--space-3);
+    width: max-content;
+    max-width: 400px;
+    font-size: 0.7rem;
+    box-shadow: var(--shadow-md);
+    opacity: 0;
+    transition: opacity 0.3s;
+}
+/* .tooltip-text::after {
+    content: "";
+    position: absolute;
+    top: 100%;
+    left: 50%;
+    margin-left: -5px;
+    border-width: 5px;
+    border-style: solid;
+    border-color: var(--text-primary) transparent transparent transparent;
+} */
+.tooltip-container:hover .tooltip-text {
+    visibility: visible;
+    opacity: 1;
+}
+/* Memory feature styling */
+.memory-info {
+    display: block;
+    margin-top: 10px;
+    padding: 8px 12px;
+    background-color: rgba(var(--primary-rgb), 0.1);
+    border-left: 3px solid var(--primary);
+    border-radius: 4px;
+    font-style: italic;
+    color: var(--primary-dark);
+}
+/* Responsive Styles */
+@media (max-width: 1200px) {
+  .filters-grid {
+    grid-template-columns: 1fr;
+  }
+  .content-row {
+    flex-direction: column;
+    height: auto;
+    gap: var(--space-4);
+  }
+  .panel {
+    height: 600px;
+  }
+}
+@media (max-width: 768px) {
+  .header-content h1 {
+    font-size: 1.75rem;
+  }
+  .header-content p {
+    display: none;
+  }
+  .date-range-inputs {
+    flex-direction: column;
+  }
+  .upload-container {
+    flex-direction: column;
+  }
+  .modal-content {
+    margin: var(--space-4) auto;
+  }
+}
+@media (max-width: 576px) {
+  :root {
+    --space-5: 1.25rem;
+  }
+  .header-content h1 {
+    font-size: 1.5rem;
+  }
+  .app-container {
+    padding: 0 var(--space-3);
+  }
+  .panel {
+    border-radius: var(--border-radius-md);
+  }
+  .chat-messages {
+    padding: var(--space-3);
+  }
+}
+/* Utility Classes */
+/* Force an element out of the layout; !important overrides the normal
+   (non-important) display declarations made elsewhere in this sheet. */
+.hidden {
+  display: none !important;
+}
+/* Force an element to render as a flex container. `.modal` and
+   `.loading-overlay` default to `display: none` in this sheet, so this is
+   presumably the class the scripts toggle to reveal them (confirm in ui.js). */
+.show {
+  display: flex !important;
+}
+/* Add Google Fonts */
+/*
+ * NOTE(review): @import rules are only honored when they come before every
+ * other style rule in a stylesheet (only @charset and @layer statements may
+ * precede them). Placed here, after the rules above, these two imports are
+ * invalid and browsers ignore them, so the fonts are never requested from
+ * this file. Move both lines to the very top of the stylesheet, or load the
+ * fonts with <link> tags in index.html.
+ */
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
+@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&display=swap');

static/data/arxiv_cs_subjects.json ADDED Viewed

	@@ -0,0 +1,162 @@

+[
+  {
+    "tag": "cs.AI",
+    "name": "Artificial Intelligence"
+  },
+  {
+    "tag": "cs.CL",
+    "name": "Computation and Language"
+  },
+  {
+    "tag": "cs.CC",
+    "name": "Computational Complexity"
+  },
+  {
+    "tag": "cs.CE",
+    "name": "Computational Engineering, Finance, and Science"
+  },
+  {
+    "tag": "cs.CG",
+    "name": "Computational Geometry"
+  },
+  {
+    "tag": "cs.GT",
+    "name": "Computer Science and Game Theory"
+  },
+  {
+    "tag": "cs.CV",
+    "name": "Computer Vision and Pattern Recognition"
+  },
+  {
+    "tag": "cs.CY",
+    "name": "Computers and Society"
+  },
+  {
+    "tag": "cs.CR",
+    "name": "Cryptography and Security"
+  },
+  {
+    "tag": "cs.DS",
+    "name": "Data Structures and Algorithms"
+  },
+  {
+    "tag": "cs.DB",
+    "name": "Databases"
+  },
+  {
+    "tag": "cs.DL",
+    "name": "Digital Libraries"
+  },
+  {
+    "tag": "cs.DM",
+    "name": "Discrete Mathematics"
+  },
+  {
+    "tag": "cs.DC",
+    "name": "Distributed, Parallel, and Cluster Computing"
+  },
+  {
+    "tag": "cs.ET",
+    "name": "Emerging Technologies"
+  },
+  {
+    "tag": "cs.FL",
+    "name": "Formal Languages and Automata Theory"
+  },
+  {
+    "tag": "cs.GL",
+    "name": "General Literature"
+  },
+  {
+    "tag": "cs.GR",
+    "name": "Graphics"
+  },
+  {
+    "tag": "cs.AR",
+    "name": "Hardware Architecture"
+  },
+  {
+    "tag": "cs.HC",
+    "name": "Human-Computer Interaction"
+  },
+  {
+    "tag": "cs.IR",
+    "name": "Information Retrieval"
+  },
+  {
+    "tag": "cs.IT",
+    "name": "Information Theory"
+  },
+  {
+    "tag": "cs.LO",
+    "name": "Logic in Computer Science"
+  },
+  {
+    "tag": "cs.LG",
+    "name": "Machine Learning"
+  },
+  {
+    "tag": "cs.MS",
+    "name": "Mathematical Software"
+  },
+  {
+    "tag": "cs.MA",
+    "name": "Multiagent Systems"
+  },
+  {
+    "tag": "cs.MM",
+    "name": "Multimedia"
+  },
+  {
+    "tag": "cs.NI",
+    "name": "Networking and Internet Architecture"
+  },
+  {
+    "tag": "cs.NE",
+    "name": "Neural and Evolutionary Computing"
+  },
+  {
+    "tag": "cs.NA",
+    "name": "Numerical Analysis"
+  },
+  {
+    "tag": "cs.OS",
+    "name": "Operating Systems"
+  },
+  {
+    "tag": "cs.OH",
+    "name": "Other Computer Science"
+  },
+  {
+    "tag": "cs.PF",
+    "name": "Performance"
+  },
+  {
+    "tag": "cs.PL",
+    "name": "Programming Languages"
+  },
+  {
+    "tag": "cs.RO",
+    "name": "Robotics"
+  },
+  {
+    "tag": "cs.SI",
+    "name": "Social and Information Networks"
+  },
+  {
+    "tag": "cs.SE",
+    "name": "Software Engineering"
+  },
+  {
+    "tag": "cs.SD",
+    "name": "Sound"
+  },
+  {
+    "tag": "cs.SC",
+    "name": "Symbolic Computation"
+  },
+  {
+    "tag": "cs.SY",
+    "name": "Systems and Control"
+  }
+]

static/index.html ADDED Viewed

	@@ -0,0 +1,304 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <meta name="description" content="A multi-modal RAG application for arXiv CS papers that allows searching and intelligent conversations">
+    <title>arXivCSRAG - Multi-Modal RAG Application</title>
+    <!-- Stylesheets -->
+    <link rel="stylesheet" href="/static/css/modern-styles.css">
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css">
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.css">
+    <!-- Favicon -->
+    <link rel="icon" type="image/svg+xml" href="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'%3E%3Cpath fill='%234361ee' d='M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm-5 14H7v-2h7v2zm3-4H7v-2h10v2zm0-4H7V7h10v2z'/%3E%3C/svg%3E">
+</head>
+<body>
+    <div class="app-container">
+        <!-- Header -->
+        <header>
+            <div class="header-content">
+                <h1>arXivCSRAG</h1>
+                <!-- <p>Multi-Modal RAG Application</p> -->
+            </div>
+            <button id="configure-api-keys-btn" class="btn btn-primary">
+                <i class="bi bi-key-fill"></i> Configure API Keys
+            </button>
+        </header>
+        <main>
+            <div class="content-row">
+                <!-- Left Panel: Search and Results -->
+                <div class="left-panel panel">
+                    <div class="nav-tabs" role="tablist">
+                        <button class="nav-link active" id="search-tab" data-tab="search-panel" role="tab" aria-selected="true">
+                            <i class="bi bi-search"></i> Search
+                        </button>
+                        <button class="nav-link" id="results-tab" data-tab="results-panel" role="tab" aria-selected="false">
+                            <i class="bi bi-list-ul"></i> Results
+                        </button>
+                    </div>
+                    <div class="tab-content">
+                        <div class="tab-pane active" id="search-panel" role="tabpanel">
+                            <!-- Search Panel Section -->
+                            <div class="search-section">
+                                <!-- <h2>Search arXiv Papers</h2> -->
+                                <div class="filters-grid">
+                                    <!-- Subject Tags Filter -->
+                                    <div class="filter-group">
+                                        <div class="tooltip-container">
+                                            <label for="subject-tags-select">Subject Areas</label>
+                                            <i class="bi bi-info-circle tooltip-icon">
+                                                <span class="tooltip-text">Hold Ctrl (or Cmd on Mac) and click to select multiple tags.</span>
+                                            </i>
+                                        </div>
+                                        <select id="subject-tags-select" class="form-control form-select" multiple>
+                                            <!-- Will be populated via JavaScript -->
+                                        </select>
+                                    </div>
+                                    <!-- Date Range Filter -->
+                                    <div class="filter-group">
+                                        <!-- The label is bound to the first input; each input also carries its own accessible name -->
+                                        <label for="start-date">Publication Date Range</label>
+                                        <div class="date-range-inputs">
+                                            <input type="date" id="start-date" class="form-control" placeholder="From" aria-label="Publication date from">
+                                            <input type="date" id="end-date" class="form-control" placeholder="To" aria-label="Publication date to">
+                                        </div>
+                                    </div>
+                                    <!-- Search Query -->
+                                    <div class="filter-group">
+                                        <label for="search-query">Search Query</label>
+                                        <input type="text" id="search-query" class="form-control" placeholder="Enter keywords, phrases...">
+                                    </div>
+                                    <!-- Max Results Filter -->
+                                    <div class="filter-group">
+                                        <label for="max-results">Max Results</label>
+                                        <input type="number" id="max-results" class="form-control" value="10" min="1" max="100">
+                                    </div>
+                                </div>
+                                <!-- Search Button -->
+                                <button id="search-button" class="btn btn-primary search-btn">
+                                    <i class="bi bi-search"></i> Search for arXiv Papers
+                                </button>
+                                <!-- Alternative: Upload PDF -->
+                                <div class="upload-section">
+                                    <p>Or upload any PDF file:  </p>
+                                    <div class="upload-container">
+                                        <div class="file-input-wrapper">
+                                            <label class="file-input-button" for="pdf-upload">
+                                                <i class="bi bi-file-earmark-pdf"></i>
+                                                <span>Choose PDF File</span>
+                                            </label>
+                                            <input type="file" id="pdf-upload" accept=".pdf">
+                                        </div>
+                                        <div class="file-name-display">
+                                            <i class="bi bi-check-circle"></i>
+                                            <span id="selected-file-name">No file selected</span>
+                                        </div>
+                                        <button id="upload-button" class="btn btn-primary">
+                                            <i class="bi bi-upload"></i> Upload PDF
+                                        </button>
+                                    </div>
+                                </div>
+                            </div>
+                        </div>
+                        <div class="tab-pane" id="results-panel" role="tabpanel">
+                            <!-- Search Results Section -->
+                            <div class="results-section">
+                                <!-- <h2>Search Results</h2> -->
+                                <div id="results-container" class="results-container">
+                                    <!-- Will be populated via JavaScript -->
+                                    <p class="no-results">No results to display. Search for papers above.</p>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+                <!-- Right Panel: Chat Interface -->
+                <div class="right-panel panel">
+                    <div class="chat-panel">
+                        <div class="chat-header">
+                            <h2><i class="bi bi-chat-left-text"></i> Chat with Paper</h2>
+                            <button id="reset-chat-btn" class="btn btn-outline" disabled>
+                                <i class="bi bi-arrow-repeat"></i> Reset Chat
+                            </button>
+                        </div>
+                        <!-- Chat Messages Container -->
+                        <div id="chat-messages" class="chat-messages">
+                            <div class="system-message">
+                                <p>Select a paper from the search results or upload a PDF to start chatting.</p>
+                            </div>
+                        </div>
+                        <!-- Chat Input -->
+                        <div class="chat-input-container">
+                            <!-- Placeholder text is not a reliable accessible name, so both controls get an explicit aria-label -->
+                            <textarea id="chat-input" class="form-control" placeholder="Ask a question about the paper..." aria-label="Chat message" disabled></textarea>
+                            <button id="send-message-btn" class="btn btn-primary" aria-label="Send message" disabled>
+                                <i class="bi bi-send-fill" aria-hidden="true"></i>
+                            </button>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </main>
+        <!-- Footer -->
+        <footer class="app-footer">
+            <div class="footer-content">
+                <p>© 2025 arXivCSRAG</p>
+                <!-- rel="noopener noreferrer" keeps the new tab from reaching back into this page via window.opener -->
+                <a href="https://github.com/YuITC/arXivRAG-Multimodal-RAG-Chatbot-Application" target="_blank" rel="noopener noreferrer" class="github-link">
+                    <i class="bi bi-github"></i> GitHub Repository
+                </a>
+            </div>
+        </footer>
+    </div>
+    <!-- API Keys Configuration Modal -->
+    <div id="api-keys-modal" class="modal">
+        <div class="modal-content">
+            <div class="modal-header">
+                <h3><i class="bi bi-key"></i> Configure API Keys</h3>
+                <button class="close-modal">&times;</button>
+            </div>
+            <div class="modal-body">
+                <p>Enter your API keys to use the application:</p>
+                <!-- Keys are secrets: mask them on screen and keep them out of browser autofill history -->
+                <div class="form-group">
+                    <label for="gemini-api-key">Google Gemini API Key</label>
+                    <input type="password" id="gemini-api-key" class="form-control" placeholder="Enter your Gemini API key" autocomplete="off" spellcheck="false">
+                </div>
+                <div class="form-group">
+                    <label for="huggingface-token">Hugging Face Token</label>
+                    <input type="password" id="huggingface-token" class="form-control" placeholder="Enter your Hugging Face token" autocomplete="off" spellcheck="false">
+                </div>
+            </div>
+            <div class="modal-footer">
+                <button id="save-api-keys-btn" class="btn btn-primary">Save</button>
+                <button class="btn btn-secondary close-btn">Cancel</button>
+            </div>
+        </div>
+    </div>
+    <!-- Paper Information Modal -->
+    <div id="paper-info-modal" class="modal">
+        <div class="modal-content">
+            <div class="modal-header">
+                <h3 id="paper-title">Paper Title</h3>
+                <button class="close-modal">&times;</button>
+            </div>
+            <div class="modal-body">
+                <div class="paper-metadata">
+                    <p><strong>Authors:</strong> <span id="paper-authors"></span></p>
+                    <p><strong>Published:</strong> <span id="paper-published"></span></p>
+                    <p><strong>Categories:</strong> <span id="paper-categories"></span></p>
+                    <p><strong>arXiv ID:</strong> <span id="paper-id"></span></p>
+                </div>
+                <div class="paper-abstract">
+                    <h4>Abstract</h4>
+                    <p id="paper-abstract"></p>
+                </div>
+            </div>
+            <div class="modal-footer">
+                <button id="open-arxiv-btn" class="btn btn-outline">
+                    <i class="bi bi-box-arrow-up-right"></i> Open arXiv Page
+                </button>
+                <button id="view-pdf-btn" class="btn btn-outline">
+                    <i class="bi bi-file-earmark-pdf"></i> View PDF
+                </button>
+                <button id="download-pdf-btn" class="btn btn-outline">
+                    <i class="bi bi-download"></i> Download PDF
+                </button>
+                <button id="chat-with-paper-btn" class="btn btn-primary">
+                    <i class="bi bi-chat-dots"></i> Chat with Paper
+                </button>
+            </div>
+        </div>
+    </div>
+    <!-- Loading Overlay -->
+    <!-- <div id="loading-overlay" class="loading-overlay">
+        <div class="spinner-container">
+            <div class="spinner-border" role="status">
+                <span class="visually-hidden">Loading...</span>
+            </div>
+            <p id="loading-message">Processing...</p>
+        </div>
+    </div> -->
+    <div id="loading-overlay" class="loading-overlay">
+        <div class="spinner-container">
+            <div class="spinner-border"></div>
+            <p class="spinner-text">Loading...</p>
+            <p id="loading-message">Processing...</p>
+        </div>
+    </div>
+    <!-- Scripts -->
+    <script src="/static/js/api.js"></script>
+    <script src="/static/js/ui.js"></script>
+    <script src="/static/js/chat.js"></script>
+    <script src="/static/js/main.js"></script>
+    <!-- Custom Tab JS -->
+    <script>
+        // Custom tab functionality (without Bootstrap dependency)
+        document.addEventListener('DOMContentLoaded', function() {
+            const tabLinks = document.querySelectorAll('.nav-link');
+            tabLinks.forEach(tabLink => {
+                tabLink.addEventListener('click', function(e) {
+                    e.preventDefault();
+                    // Remove active class from all tabs and panes
+                    document.querySelectorAll('.nav-link').forEach(tab => {
+                        tab.classList.remove('active');
+                        tab.setAttribute('aria-selected', 'false');
+                    });
+                    document.querySelectorAll('.tab-pane').forEach(pane => {
+                        pane.classList.remove('active');
+                    });
+                    // Add active class to clicked tab and its pane
+                    this.classList.add('active');
+                    this.setAttribute('aria-selected', 'true');
+                    const targetId = this.getAttribute('data-tab');
+                    document.getElementById(targetId).classList.add('active');
+                });
+            });
+        });
+    </script>
+    <!-- Citations Modal -->
+    <div id="citations-modal" class="modal">
+        <div class="modal-content">
+            <div class="modal-header">
+                <h3>Sources for Query</h3>
+                <!-- A real button (as in the other modals) so the close control is keyboard-focusable -->
+                <button class="close-modal" aria-label="Close">&times;</button>
+            </div>
+            <div class="modal-body">
+                <p id="citation-query" class="citation-query"></p>
+                <div id="citations-container" class="citations-container">
+                    <!-- Citations will be rendered here -->
+                </div>
+            </div>
+            <div class="modal-footer">
+                <button class="btn btn-secondary close-btn">Close</button>
+            </div>
+        </div>
+    </div>
+</body>
+</html>

static/js/api.js ADDED Viewed

	@@ -0,0 +1,219 @@

+/**
+ * API Service for the arXivCSRAG application
+ * Manages all API requests to the backend
+ */
+class ApiService {
+    /**
+     * Configure API keys
+     * @param   {string} geminiApiKey     - The Google Gemini API key
+     * @param   {string} huggingfaceToken - The Hugging Face token
+     * @returns {Promise}                 - The API response
+     */
+    static async configureApiKeys(geminiApiKey, huggingfaceToken) {
+        try {
+            const response = await fetch('/api/configure', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify({
+                    gemini_api_key   : geminiApiKey,
+                    huggingface_token: huggingfaceToken
+                })
+            });
+            return await response.json();
+        } catch (error) {
+            console.error('Error configuring API keys:', error);
+            throw error;
+        }
+    }
+    /**
+     * Fetch papers from arXiv
+     * @param   {Object} searchParams - The search parameters
+     * @returns {Promise}             - The API response
+     */
+    static async fetchPapers(searchParams) {
+        try {
+            const response = await fetch('/api/fetch-papers', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify(searchParams)
+            });
+            return await response.json();
+        } catch (error) {
+            console.error('Error fetching papers:', error);
+            throw error;
+        }
+    }
+    /**
+     * Get paper metadata
+     * @param   {string} arxivId - The arXiv ID of the paper
+     * @returns {Promise}        - The API response
+     */
+    static async getPaperMetadata(arxivId) {
+        try {
+            const response = await fetch('/api/paper-metadata', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify({
+                    arxiv_id: arxivId
+                })
+            });
+            return await response.json();
+        } catch (error) {
+            console.error('Error getting paper metadata:', error);
+            throw error;
+        }
+    }
+    /**
+     * Download a paper
+     * @param   {string} arxivId - The arXiv ID of the paper
+     * @returns {Promise}        - The API response
+     */
+    static async downloadPaper(arxivId) {
+        try {
+            const response = await fetch('/api/download-paper', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify({
+                    arxiv_id: arxivId
+                })
+            });
+            return await response.json();
+        } catch (error) {
+            console.error('Error downloading paper:', error);
+            throw error;
+        }
+    }
+    /**
+     * Upload a paper
+     * @param   {File} file - The PDF file to upload
+     * @returns {Promise}   - The API response
+     */
+    static async uploadPaper(file) {
+        try {
+            const formData = new FormData();
+            formData.append('file', file);
+            const response = await fetch('/api/upload-paper', {
+                method: 'POST',
+                body  : formData
+            });
+            return await response.json();
+        } catch (error) {
+            console.error('Error uploading paper:', error);
+            throw error;
+        }
+    }
+    /**
+     * Process a paper for RAG
+     * @param   {string} filePath - The path to the PDF file
+     * @returns {Promise}         - The API response
+     */
+    static async processPaper(filePath) {
+        try {
+            const formData = new FormData();
+            formData.append('file_path', filePath);
+            const response = await fetch('/api/process-paper', {
+                method: 'POST',
+                body  : formData
+            });
+            return await response.json();
+        } catch (error) {
+            console.error('Error processing paper:', error);
+            throw error;
+        }
+    }
+    /**
+     * Chat with a processed paper
+     * @param   {string} message - The user's message
+     * @returns {Promise}        - The API response
+     */
+    static async chatWithPaper(message) {
+        try {
+            const response = await fetch('/api/chat', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify({
+                    message: message
+                })
+            });
+            return await response.json();
+        } catch (error) {
+            console.error('Error chatting with paper:', error);
+            throw error;
+        }
+    }
+    /**
+     * Reset the chat
+     * @returns {Promise} - The API response
+     */
+    static async resetChat() {
+        try {
+            const response = await fetch('/api/reset-chat', {
+                method: 'POST'
+            });
+            return await response.json();
+        } catch (error) {
+            console.error('Error resetting chat:', error);
+            throw error;
+        }
+    }
+    /**
+     * Fetch citations for a specific query
+     * @param   {string} message - The query message
+     * @returns {Promise}        - The API response with citations
+     */
+    static async fetchCitations(message) {
+        try {
+            const response = await fetch('/api/fetch-citations', {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify({
+                    message: message
+                })
+            });
+            return await response.json();
+        } catch (error) {
+            console.error('Error fetching citations:', error);
+            throw error;
+        }
+    }
+}

static/js/chat.js ADDED Viewed

	@@ -0,0 +1,198 @@

+/**
+ * Chat Manager for the arXivCSRAG application
+ * Handles chat interactions and state
+ */
+class ChatManager {
+    constructor(uiManager) {
+        this.uiManager        = uiManager;
+        this.currentPaperPath = null;
+        this.isProcessed      = false;
+        // Initialize chat events
+        this.initEvents();
+    }
+    /**
+     * Initialize chat event listeners
+     */
+    initEvents() {
+        const chatInput   = document.getElementById('chat-input');
+        const sendButton  = document.getElementById('send-message-btn');
+        const resetButton = document.getElementById('reset-chat-btn');
+        // Send message on button click
+        sendButton.addEventListener('click', () => {
+            this.sendMessage();
+        });
+        // Send message on Enter key (but allow Shift+Enter for new lines)
+        chatInput.addEventListener('keydown', (event) => {
+            if (event.key === 'Enter' && !event.shiftKey) {
+                event.preventDefault();
+                this.sendMessage();
+            }
+        });
+        // Reset chat
+        resetButton.addEventListener('click', () => {
+            this.resetChat();
+        });
+    }
+    /**
+     * Send a message to the API
+     */
+    async sendMessage() {
+        const chatInput = document.getElementById('chat-input');
+        const message   = chatInput.value.trim();
+        if (!message || !this.isProcessed) {
+            return;
+        }
+        // Add user message to chat
+        this.uiManager.addChatMessage(message, true);
+        // Clear input
+        chatInput.value = '';
+        // Disable chat while waiting for response
+        this.uiManager.disableChat();
+        this.uiManager.showLoading('Getting answer...');
+        try {
+            // Send message to API
+            const response = await ApiService.chatWithPaper(message);
+            if (response.status === 'success') {
+                // Add bot response to chat
+                this.uiManager.addChatMessage(response.response, false, response.citations);
+            } else {
+                this.uiManager.addSystemMessage('Error: Failed to get a response. Please try again.');
+            }
+        } catch (error) {
+            console.error('Error in chat:', error);
+            this.uiManager.addSystemMessage('Error: An unexpected error occurred. Please try again.');
+        } finally {
+            // Re-enable chat
+            this.uiManager.enableChat();
+            this.uiManager.hideLoading();
+        }
+    }
+    /**
+     * Process a paper for chatting
+     * @param {string} filePath - Path to the paper file
+     */
+    async processPaper(filePath) {
+        if (!filePath) {
+            this.uiManager.addSystemMessage('Error: No paper file specified.');
+            return;
+        }
+        this.currentPaperPath = filePath;
+        this.uiManager.clearChat();
+        this.uiManager.disableChat();
+        this.uiManager.showLoading('Processing paper... This may take about 1-2 minutes.');
+        try {
+            const response = await ApiService.processPaper(filePath);
+            if (response.status === 'success') {
+                this.isProcessed = true;
+                this.uiManager.enableChat();
+                this.uiManager.addSystemMessage(
+                    `Extracted ${response.stats.texts} text chunks, ${response.stats.tables} tables, and ${response.stats.images} images.<br>
+                    Paper processed successfully! You can now ask questions about the paper.<br>
+                    <span class="memory-info">🧠 The chatbot now has conversational memory and will remember your previous questions.</span>`
+                );
+            } else {
+                this.uiManager.addSystemMessage('Error: Failed to process paper. Please try again.');
+            }
+        } catch (error) {
+            console.error('Error processing paper:', error);
+            this.uiManager.addSystemMessage('Error: An unexpected error occurred while processing the paper. Please try again.');
+        } finally {
+            this.uiManager.hideLoading();
+        }
+    }
+    /**
+     * Reset the chat session
+     */
+    async resetChat() {
+        this.uiManager.showLoading('Resetting chat and conversation memory...');
+        try {
+            await ApiService.resetChat();
+            this.isProcessed = false;
+            this.currentPaperPath = null;
+            this.uiManager.clearChat();
+            this.uiManager.disableChat();
+            this.uiManager.addSystemMessage('Select a paper from the search results or upload a PDF to start chatting. Conversation memory has been reset.');
+        } catch (error) {
+            console.error('Error resetting chat:', error);
+            this.uiManager.addSystemMessage('Error: Failed to reset chat. Please try again.');
+        } finally {
+            this.uiManager.hideLoading();
+        }
+    }
+    /**
+     * Download and process a paper by arXiv ID
+     * @param {string} arxivId - The arXiv ID of the paper
+     */
+    async downloadAndProcessPaper(arxivId) {
+        this.uiManager.showLoading('Downloading paper...');
+        try {
+            const response = await ApiService.downloadPaper(arxivId);
+            if (response.status === 'success') {
+                this.uiManager.hideModal(this.uiManager.paperInfoModal);
+                await this.processPaper(response.file_path);
+            } else {
+                this.uiManager.addSystemMessage('Error: Failed to download paper. Please try again.');
+                this.uiManager.hideLoading();
+            }
+        } catch (error) {
+            console.error('Error downloading paper:', error);
+            this.uiManager.addSystemMessage('Error: An unexpected error occurred while downloading the paper. Please try again.');
+            this.uiManager.hideLoading();
+        }
+    }
+    /**
+     * Fetch and display citations for a specific message
+     * @param {string} message - The message to get citations for
+     */
+    async fetchAndDisplayCitations(message) {
+        if (!message || !this.isProcessed) {
+            return;
+        }
+        this.uiManager.showLoading('Fetching sources...');
+        try {
+            // Fetch citations from API
+            const response = await ApiService.fetchCitations(message);
+            if (response.status === 'success') {
+                // Create a modal to display the citations
+                this.uiManager.showCitationsModal(message, response.citations);
+            } else {
+                this.uiManager.addSystemMessage('Error: Failed to fetch citations. Please try again.');
+            }
+        } catch (error) {
+            console.error('Error fetching citations:', error);
+            this.uiManager.addSystemMessage('Error: An unexpected error occurred while fetching citations.');
+        } finally {
+            this.uiManager.hideLoading();
+        }
+    }
+}

static/js/main.js ADDED Viewed

	@@ -0,0 +1,196 @@

+/**
+ * Main application entry point for arXivCSRAG
+ */
+document.addEventListener('DOMContentLoaded', () => {
+    // Initialize UI and Chat managers
+    const uiManager   = new UIManager();
+    const chatManager = new ChatManager(uiManager);
+    // API Keys configuration
+    const saveApiKeysBtn = document.getElementById('save-api-keys-btn');
+    saveApiKeysBtn.addEventListener('click', async () => {
+        const geminiApiKey     = document.getElementById('gemini-api-key').value.trim();
+        const huggingfaceToken = document.getElementById('huggingface-token').value.trim();
+        if (!geminiApiKey || !huggingfaceToken) {
+            alert('Please enter both API keys.');
+            return;
+        }
+        uiManager.showLoading('Configuring API keys...');
+        try {
+            const response = await ApiService.configureApiKeys(geminiApiKey, huggingfaceToken);
+            if (response.status === 'success') {
+                uiManager.hideModal(uiManager.apiKeysModal);
+                alert('API keys configured successfully!');
+            } else {
+                alert('Failed to configure API keys. Please try again.');
+            }
+        } catch (error) {
+            console.error('Error saving API keys:', error);
+            alert('An unexpected error occurred. Please try again.');
+        } finally {
+            uiManager.hideLoading();
+        }
+    });
+    // Paper search functionality
+    const searchButton = document.getElementById('search-button');
+    searchButton.addEventListener('click', async () => {
+        // Get search parameters
+        const subjectTags = Array.from(document.getElementById('subject-tags-select').selectedOptions).map(option => option.value);
+        const startDate   = document.getElementById('start-date').value;
+        const endDate     = document.getElementById('end-date').value;
+        const maxResults  = parseInt(document.getElementById('max-results').value) || 10;
+        const query       = document.getElementById('search-query').value.trim();
+        // if (!query) {
+        //     alert('Please enter a search query.');
+        //     return;
+        // }
+        uiManager.showLoading('Searching for papers...');
+        try {
+            const response = await ApiService.fetchPapers({
+                subject_tags: subjectTags.length > 0 ? subjectTags : null,
+                start_date  : startDate || null,
+                end_date    : endDate || null,
+                max_results : maxResults,
+                query       : query
+            });
+            if (response.status === 'success') {
+                uiManager.renderSearchResults(response.papers);
+            } else {
+                alert('Failed to fetch papers. Please try again.');
+            }
+        } catch (error) {
+            console.error('Error searching papers:', error);
+            alert('An unexpected error occurred. Please try again.');
+        } finally {
+            uiManager.hideLoading();
+        }
+    });
+    // Upload paper functionality
+    const uploadButton = document.getElementById('upload-button');
+    const fileInput = document.getElementById('pdf-upload');
+    const fileNameDisplay = document.querySelector('.file-name-display');
+    const selectedFileName = document.getElementById('selected-file-name');
+    // Handle file selection display
+    fileInput.addEventListener('change', (event) => {
+        const file = fileInput.files[0];
+        console.log('File selected:', file ? file.name : 'No file');
+        if (file) {
+            selectedFileName.textContent = file.name;
+            fileNameDisplay.classList.add('active');
+            uploadButton.classList.add('file-selected');
+            uploadButton.innerHTML = '<i class="bi bi-upload"></i> Upload "' + file.name.substring(0, 15) + (file.name.length > 15 ? '...' : '') + '"';
+            console.log('File name display should be visible now');
+        } else {
+            selectedFileName.textContent = 'No file selected';
+            fileNameDisplay.classList.remove('active');
+            uploadButton.classList.remove('file-selected');
+            uploadButton.innerHTML = '<i class="bi bi-upload"></i> Upload PDF';
+        }
+    });
+    uploadButton.addEventListener('click', async () => {
+        const file = fileInput.files[0];
+        if (!file) {
+            alert('Please select a PDF file to upload.');
+            return;
+        }
+        if (file.type !== 'application/pdf') {
+            alert('Please select a valid PDF file.');
+            return;
+        }
+        uiManager.showLoading('Uploading paper...');
+        try {
+            const response = await ApiService.uploadPaper(file);
+            if (response.status === 'success') {
+                await chatManager.processPaper(response.file_path);
+            } else {
+                alert('Failed to upload paper. Please try again.');
+                uiManager.hideLoading();
+            }
+        } catch (error) {
+            console.error('Error uploading paper:', error);
+            alert('An unexpected error occurred. Please try again.');
+            uiManager.hideLoading();
+        }
+    });
+    // Paper info modal buttons
+    document.getElementById('open-arxiv-btn').addEventListener('click', function() {
+        const url = this.dataset.url;
+        if (url) {
+            window.open(url, '_blank');
+        }
+    });
+    document.getElementById('view-pdf-btn').addEventListener('click', function() {
+        const url = this.dataset.url;
+        if (url) {
+            window.open(url, '_blank');
+        }
+    });
+    document.getElementById('download-pdf-btn').addEventListener('click', function() {
+        const paperId = this.dataset.paperId;
+        if (paperId) {
+            // Create a temporary link to download the file
+            uiManager.showLoading('Preparing download...');
+            ApiService.downloadPaper(paperId)
+                .then(response => {
+                    if (response.status === 'success') {
+                        // Create a link to download the file directly
+                        const a = document.createElement('a');
+                        a.href = response.file_path;
+                        a.download = `${paperId.replace('/', '_')}.pdf`;
+                        document.body.appendChild(a);
+                        a.click();
+                        document.body.removeChild(a);
+                    } else {
+                        alert('Failed to download paper. Please try again.');
+                    }
+                })
+                .catch(error => {
+                    console.error('Error downloading paper:', error);
+                    alert('An unexpected error occurred. Please try again.');
+                })
+                .finally(() => {
+                    uiManager.hideLoading();
+                });
+        }
+    });
+    document.getElementById('chat-with-paper-btn').addEventListener('click', function() {
+        const paperId = this.dataset.paperId;
+        if (paperId) {
+            chatManager.downloadAndProcessPaper(paperId);
+        }
+    });
+    // Show API keys modal on first load
+    setTimeout(() => {
+        uiManager.showModal(uiManager.apiKeysModal);
+    }, 500);
+});

static/js/ui.js ADDED Viewed

	@@ -0,0 +1,364 @@

+/**
+ * UI Manager for the arXivCSRAG application
+ * Handles UI elements, modals, and rendering
+ */
+class UIManager {
+    constructor() {
+        // Cache DOM elements
+        this.apiKeysModal      = document.getElementById('api-keys-modal');
+        this.paperInfoModal    = document.getElementById('paper-info-modal');
+        this.citationsModal    = document.getElementById('citations-modal');
+        this.loadingOverlay    = document.getElementById('loading-overlay');
+        this.loadingMessage    = document.getElementById('loading-message');
+        this.resultsContainer  = document.getElementById('results-container');
+        this.chatMessages      = document.getElementById('chat-messages');
+        this.subjectTagsSelect = document.getElementById('subject-tags-select');
+        // Initialize UI
+        this.initEventListeners();
+        this.loadSubjectTags();
+    }
+    /**
+     * Initialize event listeners for UI elements
+     */
+    initEventListeners() {
+        // Modal close buttons
+        document.querySelectorAll('.close-modal, .close-btn').forEach(button => {
+            button.addEventListener('click', () => {
+                this.hideModal(this.apiKeysModal);
+                this.hideModal(this.paperInfoModal);
+                this.hideModal(this.citationsModal);
+            });
+        });
+        // Configure API Keys button
+        document.getElementById('configure-api-keys-btn').addEventListener('click', () => {
+            this.showModal(this.apiKeysModal);
+        });
+        // Window click to close modals
+        window.addEventListener('click', (event) => {
+            if (event.target === this.apiKeysModal) {
+                this.hideModal(this.apiKeysModal);
+            } else if (event.target === this.paperInfoModal) {
+                this.hideModal(this.paperInfoModal);
+            } else if (event.target === this.citationsModal) {
+                this.hideModal(this.citationsModal);
+            }
+        });
+    }
+    /**
+     * Load subject tags from JSON file
+     */
+    async loadSubjectTags() {
+        try {
+            const response = await fetch('/data/arxiv_cs_subjects.json');
+            const subjects = await response.json();
+            // Clear existing options
+            this.subjectTagsSelect.innerHTML = '';
+            // Add options to select
+            subjects.forEach(subject => {
+                const option       = document.createElement('option');
+                option.value       = subject.tag;
+                option.textContent = `${subject.tag}: ${subject.name}`;
+                this.subjectTagsSelect.appendChild(option);
+            });
+        } catch (error) {
+            console.error('Error loading subject tags:', error);
+        }
+    }
+    /**
+     * Show a modal
+     * @param {HTMLElement} modal - The modal to show
+     */
+    showModal(modal) {
+        modal.style.display = 'block';
+    }
+    /**
+     * Hide a modal
+     * @param {HTMLElement} modal - The modal to hide
+     */
+    hideModal(modal) {
+        modal.style.display = 'none';
+    }
+    /**
+     * Show the loading overlay
+     * @param {string} message - The loading message to display
+     */
+    showLoading(message = 'Processing...') {
+        this.loadingMessage.textContent   = message;
+        this.loadingOverlay.style.display = 'flex';
+    }
+    /**
+     * Hide the loading overlay
+     */
+    hideLoading() {
+        this.loadingOverlay.style.display = 'none';
+    }
+    /**
+     * Render search results
+     * @param {Array} papers - The papers to render
+     */
+    renderSearchResults(papers) {
+        if (!papers || papers.length === 0) {
+            this.resultsContainer.innerHTML = '<p class="no-results">No papers found matching your criteria.</p>';
+            return;
+        }
+        this.resultsContainer.innerHTML = '';
+        papers.forEach(paper => {
+            const paperElement           = document.createElement('div');
+            paperElement.className       = 'paper-item';
+            paperElement.dataset.paperId = paper.arxiv_id;
+            const categories = paper.categories.map(cat =>
+                `<span class="paper-category">${cat}</span>`
+            ).join('');
+            paperElement.innerHTML = `
+                <div class="paper-item-header">
+                    <h4 class="paper-title">${paper.title}</h4>
+                    <span class="paper-date">${paper.published}</span>
+                </div>
+                <p class="paper-authors">${paper.authors.join(', ')}</p>
+                <div class="paper-categories">${categories}</div>
+            `;
+            paperElement.addEventListener('click', () => {
+                this.showPaperInfo(paper);
+            });
+            this.resultsContainer.appendChild(paperElement);
+        });
+    }
+    /**
+     * Show paper information in the modal
+     * @param {Object} paper - The paper data
+     */
+    showPaperInfo(paper) {
+        // Set modal content
+        document.getElementById('paper-title').textContent      = paper.title;
+        document.getElementById('paper-authors').textContent    = paper.authors.join(', ');
+        document.getElementById('paper-published').textContent  = paper.published;
+        document.getElementById('paper-categories').textContent = paper.categories.join(', ');
+        document.getElementById('paper-id').textContent         = paper.arxiv_id;
+        document.getElementById('paper-abstract').textContent   = paper.abstract;
+        // Set data attributes for buttons
+        const openArxivBtn     = document.getElementById('open-arxiv-btn');
+        const viewPdfBtn       = document.getElementById('view-pdf-btn');
+        const downloadPdfBtn   = document.getElementById('download-pdf-btn');
+        const chatWithPaperBtn = document.getElementById('chat-with-paper-btn');
+        openArxivBtn.dataset.url         = paper.entry_id;
+        viewPdfBtn.dataset.url           = paper.pdf_url;
+        downloadPdfBtn.dataset.paperId   = paper.arxiv_id;
+        chatWithPaperBtn.dataset.paperId = paper.arxiv_id;
+        // Show the modal
+        this.showModal(this.paperInfoModal);
+    }
+    /**
+     * Add a message to the chat interface
+     * @param {string} message   - The message text
+     * @param {boolean} isUser   - Whether the message is from the user
+     * @param {Object} citations - Citations from the AI response
+     */
+    addChatMessage(message, isUser, citations = null) {
+        const messageElement     = document.createElement('div');
+        messageElement.className = `message ${isUser ? 'user-message' : 'bot-message'}`;
+        // If it's a bot message, parse markdown (simple version)
+        if (!isUser) {
+            // Simple markdown parsing for links, bold, italic, code
+            const formattedMessage = message
+                .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
+                .replace(/\*(.*?)\*/g, '<em>$1</em>')
+                .replace(/`(.*?)`/g, '<code>$1</code>')
+                .replace(/\n/g, '<br>');
+            messageElement.innerHTML = formattedMessage;                // Add citations if available
+                if (citations && (citations.texts.length > 0 || citations.images.length > 0 || citations.tables.length > 0)) {
+                    const citationsElement     = document.createElement('div');
+                    citationsElement.className = 'citations';
+                    // Add sources section with View All button
+                    const sourcesHeader = document.createElement('div');
+                    sourcesHeader.className = 'citations-header';
+                    sourcesHeader.innerHTML = `
+                        <button class="view-all-citations-btn">View Sources</button>
+                    `;
+                    citationsElement.appendChild(sourcesHeader);
+                    messageElement.appendChild(citationsElement);
+                    // Add view all functionality
+                    const viewAllBtn = sourcesHeader.querySelector('.view-all-citations-btn');
+                    viewAllBtn.addEventListener('click', () => {
+                        this.showCitationsModal('Response to: ' + message, citations);
+                    });
+                }
+        } else {
+            messageElement.textContent = message;
+        }
+        this.chatMessages.appendChild(messageElement);
+        this.chatMessages.scrollTop = this.chatMessages.scrollHeight;
+    }
+    /**
+     * Clear the chat messages
+     */
+    clearChat() {
+        this.chatMessages.innerHTML = '';
+        this.addSystemMessage('Chat reset. You can now ask questions about the paper.');
+    }
+    /**
+     * Add a system message to the chat
+     * @param {string} message - The system message
+     */
+    addSystemMessage(message) {
+        const systemMessage = document.createElement('div');
+        systemMessage.className = 'system-message';
+        systemMessage.innerHTML = `<p>${message}</p>`;
+        this.chatMessages.appendChild(systemMessage);
+    }
+    /**
+     * Enable chat functionality
+     */
+    enableChat() {
+        document.getElementById('chat-input').disabled       = false;
+        document.getElementById('send-message-btn').disabled = false;
+        document.getElementById('reset-chat-btn').disabled   = false;
+    }
+    /**
+     * Disable chat functionality
+     */
+    disableChat() {
+        document.getElementById('chat-input').disabled       = true;
+        document.getElementById('send-message-btn').disabled = true;
+        document.getElementById('reset-chat-btn').disabled   = true;
+    }
+    /**
+     * Show citations in a modal
+     * @param {string} query     - The query that generated these citations
+     * @param {Object} citations - The citations object with texts, tables, and images
+     */
+    showCitationsModal(query, citations) {
+        // Cache DOM elements
+        const citationsModal    = document.getElementById('citations-modal');
+        const citationQuery     = document.getElementById('citation-query');
+        const citationsContainer = document.getElementById('citations-container');
+        // Set the query
+        citationQuery.textContent = `"${query}"`;
+        // Clear previous citations
+        citationsContainer.innerHTML = '';
+        // Add text citations
+        if (citations.texts.length > 0) {
+            const textSection = document.createElement('div');
+            textSection.className = 'citation-section';
+            textSection.innerHTML = '<h4>Text Excerpts</h4>';
+            const textList = document.createElement('div');
+            textList.className = 'citation-list';
+            citations.texts.forEach((text, index) => {
+                const textItem = document.createElement('div');
+                textItem.className = 'text-citation';
+                textItem.innerHTML = `<div class="citation-number">${index + 1}</div><div class="citation-text">${text}</div>`;
+                textList.appendChild(textItem);
+            });
+            textSection.appendChild(textList);
+            citationsContainer.appendChild(textSection);
+        }
+        // Add table citations
+        if (citations.tables.length > 0) {
+            const tableSection = document.createElement('div');
+            tableSection.className = 'citation-section';
+            tableSection.innerHTML = '<h4>Tables</h4>';
+            const tableList = document.createElement('div');
+            tableList.className = 'citation-list';
+            citations.tables.forEach((tableHtml, index) => {
+                const tableItem = document.createElement('div');
+                tableItem.className = 'table-citation';
+                // Wrap the table in a container for better responsiveness
+                tableItem.innerHTML = `
+                    <div class="citation-number">${index + 1}</div>
+                    <div class="citation-table-container">
+                        <div class="citation-table">${tableHtml}</div>
+                    </div>
+                `;
+                tableList.appendChild(tableItem);
+            });
+            tableSection.appendChild(tableList);
+            citationsContainer.appendChild(tableSection);
+        }
+        // Add image citations
+        if (citations.images.length > 0) {
+            const imageSection = document.createElement('div');
+            imageSection.className = 'citation-section';
+            imageSection.innerHTML = '<h4>Images</h4>';
+            const imageList = document.createElement('div');
+            imageList.className = 'citation-list';
+            citations.images.forEach((imageBase64, index) => {
+                const imageItem = document.createElement('div');
+                imageItem.className = 'image-citation';
+                imageItem.innerHTML = `
+                    <div class="citation-number">${index + 1}</div>
+                    <div class="citation-image-container">
+                        <img src="data:image/jpeg;base64,${imageBase64}" alt="Citation image ${index + 1}" class="responsive-image" loading="lazy">
+                    </div>
+                `;
+                imageList.appendChild(imageItem);
+            });
+            imageSection.appendChild(imageList);
+            citationsContainer.appendChild(imageSection);
+        }
+        // Show the modal
+        this.showModal(citationsModal);
+    }
+}

utils/__pycache__/setup_logger.cpython-310.pyc ADDED Viewed

Binary file (939 Bytes). View file

utils/setup_logger.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""
+Logger setup utilities.
+"""
+import logging
+def setup_logger(name: str = __name__) -> logging.Logger:
+    """
+    Set up logger based on the module name. Ensures:
+    - No duplicate handlers are added.
+    - Propagation to the root logger is disabled.
+    - A standard formatter is applied.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+    # Remove all old handlers if they exist (prevents duplicate logs on reload)
+    if logger.hasHandlers():
+        logger.handlers.clear()
+    # Turn off propagation to the root logger
+    logger.propagate = False
+    # Create console handler
+    console_handler = logging.StreamHandler()
+    formatter       = logging.Formatter(
+        fmt     = "[%(levelname)s]\t%(name)s\t%(funcName)s\t%(message)s",
+        datefmt = "%Y-%m-%d %H:%M:%S"
+    )
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
+    return logger