YuITC committed on
Commit
c8e875f
·
1 Parent(s): 05877ff

Add application file

Browse files
Files changed (45) hide show
  1. .dockerignore +59 -0
  2. Dockerfile +59 -0
  3. app.py +19 -0
  4. src/__init__.py +3 -0
  5. src/__pycache__/__init__.cpython-310.pyc +0 -0
  6. src/__pycache__/api.cpython-310.pyc +0 -0
  7. src/__pycache__/config.cpython-310.pyc +0 -0
  8. src/api.py +308 -0
  9. src/config.py +43 -0
  10. src/data_extraction/__init__.py +3 -0
  11. src/data_extraction/__pycache__/__init__.cpython-310.pyc +0 -0
  12. src/data_extraction/__pycache__/extractor.cpython-310.pyc +0 -0
  13. src/data_extraction/extractor.py +51 -0
  14. src/fetcher/__init__.py +3 -0
  15. src/fetcher/__pycache__/__init__.cpython-310.pyc +0 -0
  16. src/fetcher/__pycache__/arxiv_fetcher.cpython-310.pyc +0 -0
  17. src/fetcher/arxiv_cs_subjects.json +162 -0
  18. src/fetcher/arxiv_fetcher.py +118 -0
  19. src/processors/__init__.py +3 -0
  20. src/processors/__pycache__/__init__.cpython-310.pyc +0 -0
  21. src/processors/__pycache__/image_processor.cpython-310.pyc +0 -0
  22. src/processors/__pycache__/prompts.cpython-310.pyc +0 -0
  23. src/processors/__pycache__/table_processor.cpython-310.pyc +0 -0
  24. src/processors/__pycache__/text_processor.cpython-310.pyc +0 -0
  25. src/processors/image_processor.py +64 -0
  26. src/processors/prompts.py +101 -0
  27. src/processors/table_processor.py +57 -0
  28. src/processors/text_processor.py +57 -0
  29. src/rag/__init__.py +3 -0
  30. src/rag/__pycache__/__init__.cpython-310.pyc +0 -0
  31. src/rag/__pycache__/pipeline.cpython-310.pyc +0 -0
  32. src/rag/pipeline.py +149 -0
  33. src/storage/__init__.py +3 -0
  34. src/storage/__pycache__/__init__.cpython-310.pyc +0 -0
  35. src/storage/__pycache__/vectorstore.cpython-310.pyc +0 -0
  36. src/storage/vectorstore.py +151 -0
  37. static/css/modern-styles.css +1177 -0
  38. static/data/arxiv_cs_subjects.json +162 -0
  39. static/index.html +304 -0
  40. static/js/api.js +219 -0
  41. static/js/chat.js +198 -0
  42. static/js/main.js +196 -0
  43. static/js/ui.js +364 -0
  44. utils/__pycache__/setup_logger.cpython-310.pyc +0 -0
  45. utils/setup_logger.py +32 -0
.dockerignore ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Exclude files and directories from Docker build
2
+ # Version control
3
+ .git
4
+ .gitignore
5
+ .github
6
+ .gitattributes
7
+
8
+ # Python artifacts
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+ *.so
13
+ .Python
14
+ env/
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+
30
+ # Virtual environment
31
+ venv/
32
+ ENV/
33
+ env/
34
+
35
+ # IDE artifacts
36
+ .idea/
37
+ .vscode/
38
+ *.swp
39
+ *.swo
40
+
41
+ # Docker artifacts
42
+ # .dockerignore
43
+ # Dockerfile
44
+ # docker-compose.yml
45
+
46
+ # Documentation
47
+ docs/
48
+ *.md
49
+ LICENSE
50
+
51
+ # Misc
52
+ .DS_Store
53
+ **/*.log
54
+ **/.env*
55
+
56
+ # Others
57
+ *.ipynb
58
+ LICENCE
59
+ assets/
Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ---- Stage 1: build the Python dependencies ---------------------------------
FROM python:3.10-slim AS builder

WORKDIR /app

# Compilers and headers are only needed to build wheels, so they live in the
# builder stage and never reach the runtime image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        gcc \
        g++ \
        python3-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements file
COPY requirements.txt .

# Install into a dedicated prefix so the whole dependency tree can be copied
# into the runtime stage in one step. The previous version installed into the
# builder's system site-packages but copied /root/.local, a directory that a
# plain `pip install` never populates, so the dependencies never reached the
# runtime image.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir --prefix=/install -r requirements.txt

# ---- Stage 2: runtime image -------------------------------------------------
FROM python:3.10-slim

WORKDIR /app

# Install runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
        poppler-utils \
        tesseract-ocr \
        libreoffice \
    && rm -rf /var/lib/apt/lists/*

# /usr/local is the interpreter's own prefix in the official python images, so
# packages (and console scripts in bin/) copied here are usable by every user,
# including the non-root user created below.
COPY --from=builder /install /usr/local

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p /app/data/temp

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=8000

# Expose the port
EXPOSE 8000

# Run as a non-root user; /app must be owned by it because the application
# creates its temp directory under the project root at import time.
RUN useradd -m appuser && chown -R appuser:appuser /app
USER appuser

# Run the application
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main entry point for the arXivCSRAG application.
3
+ """
4
+ import os
5
+ import uvicorn
6
+ from dotenv import load_dotenv
7
+ from utils.setup_logger import setup_logger
8
+
9
+ load_dotenv()
10
+ logger = setup_logger(__name__)
11
+
12
+
13
+ if __name__ == '__main__':
14
+ from src.api import app
15
+
16
+ port = int(os.environ.get('PORT', 8000))
17
+ logger.info(f"Starting arXivCSRAG Application on port {port}...")
18
+
19
+ uvicorn.run(app, host='0.0.0.0', port=port)
src/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ arXivCSRAG Application source code.
3
+ """
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (215 Bytes). View file
 
src/__pycache__/api.cpython-310.pyc ADDED
Binary file (8.8 kB). View file
 
src/__pycache__/config.cpython-310.pyc ADDED
Binary file (1.08 kB). View file
 
src/api.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI backend for the arXivCSRAG application.
3
+ """
4
+ import os
5
+ from typing import List, Optional
6
+ from pathlib import Path
7
+ from datetime import datetime
8
+ from pydantic import BaseModel
9
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
10
+ from fastapi.staticfiles import StaticFiles
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+
13
+ from utils.setup_logger import setup_logger
14
+ from src.config import TEMP_DIR, ROOT_DIR
15
+ from src.fetcher.arxiv_fetcher import ArxivFetcher
16
+ from src.data_extraction.extractor import extract_from_pdf, separate_content_types
17
+ from src.processors.text_processor import TextProcessor
18
+ from src.processors.table_processor import TableProcessor
19
+ from src.processors.image_processor import ImageProcessor
20
+ from src.storage.vectorstore import VectorStore
21
+ from src.rag.pipeline import RAGPipeline
22
+
23
+
24
+ # Configure logging
25
+ logger = setup_logger(__name__)
26
+
27
+
28
+ # Initialize the FastAPI app
29
+ app = FastAPI(
30
+ title = 'arXivCSRAG API',
31
+ description = 'API for the arXivCSRAG Multimodal RAG Application',
32
+ version = '1.0.0',
33
+ )
34
+
35
+
36
+ # CORS configuration
37
+ app.add_middleware(
38
+ CORSMiddleware,
39
+ allow_origins = ['*'],
40
+ allow_credentials = True,
41
+ allow_methods = ['*'],
42
+ allow_headers = ['*'],
43
+ )
44
+
45
+
46
+ # Models
47
+ class APIKeys(BaseModel):
48
+ gemini_api_key : str
49
+ huggingface_token: str
50
+
51
+ class SearchQuery(BaseModel):
52
+ subject_tags: Optional[List[str]] = None
53
+ start_date : Optional[str] = None
54
+ end_date : Optional[str] = None
55
+ max_results : int = 10
56
+ query : str
57
+
58
+ class PaperID(BaseModel):
59
+ arxiv_id: str
60
+
61
+ class ChatMessage(BaseModel):
62
+ message: str
63
+
64
+
65
+ # Initialize components
66
+ arxiv_fetcher = ArxivFetcher()
67
+ text_processor = TextProcessor()
68
+ table_processor = TableProcessor()
69
+ image_processor = ImageProcessor()
70
+ vector_store = VectorStore()
71
+ rag_pipeline = RAGPipeline(vector_store.retriever)
72
+
73
+
74
+ # API endpoints
75
@app.post('/api/configure')
async def configure_api_keys(api_keys: APIKeys):
    """
    Store the supplied API keys in the process environment.

    Args:
        api_keys (APIKeys): Gemini API key and Hugging Face token

    Returns:
        dict: ``status`` and a human-readable ``message``

    Raises:
        HTTPException: 500 if the environment could not be updated
    """
    env_values = (
        ('GOOGLE_API_KEY', api_keys.gemini_api_key),
        ('HF_TOKEN', api_keys.huggingface_token),
    )
    try:
        # Exported as environment variables for the rest of the process.
        for env_name, env_value in env_values:
            os.environ[env_name] = env_value

        logger.info('API keys configured successfully')
        payload = {
            'status': 'success',
            'message': 'API keys configured successfully',
        }
        return payload

    except Exception as exc:
        logger.error(f"Error configuring API keys: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
91
+
92
+
93
@app.post('/api/fetch-papers')
async def fetch_papers(search_query: SearchQuery):
    """
    Search arXiv using the filters carried by the request body.

    Args:
        search_query (SearchQuery): Subject tags, date range, result limit and free-text query

    Returns:
        dict: ``status`` and the list of ``papers`` returned by the fetcher

    Raises:
        HTTPException: 500 if the fetcher raises
    """
    try:
        fetch_kwargs = {
            'subject_tags': search_query.subject_tags,
            'start_date': search_query.start_date,
            'end_date': search_query.end_date,
            'max_results': search_query.max_results,
            'query': search_query.query,
        }
        papers = arxiv_fetcher.fetch_papers(**fetch_kwargs)

        logger.info(f"Fetched {len(papers)} papers")
        return {'status': 'success', 'papers': papers}

    except Exception as exc:
        logger.error(f"Error fetching papers: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
112
+
113
+
114
@app.post('/api/paper-metadata')
async def get_paper_metadata(paper_id: PaperID):
    """
    Get metadata for a specific paper, looked up by its arXiv ID.

    The previous implementation passed ``"id:<arxiv_id>"`` as the first
    positional argument of ``ArxivFetcher.fetch_papers``, which is
    ``subject_tags``; the string was then iterated character by character into
    a meaningless ``cat:`` query. The lookup now uses the arXiv client's
    ``id_list`` search directly.

    Args:
        paper_id (PaperID): Request body carrying the arXiv ID

    Returns:
        dict: ``status`` and a ``metadata`` dictionary with the same keys that
        ``ArxivFetcher.fetch_papers`` produces for each paper

    Raises:
        HTTPException: 404 if no paper matches the ID, 500 if the lookup fails
    """
    # Local import: this module does not import ``arxiv`` at the top level.
    import arxiv

    try:
        search = arxiv.Search(id_list=[paper_id.arxiv_id], max_results=1)
        paper = next(iter(arxiv_fetcher.client.results(search)), None)
    except Exception as e:
        logger.error(f"Error getting paper metadata: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    # Raised outside the try block so the 404 is not re-wrapped as a 500.
    if paper is None:
        raise HTTPException(status_code=404, detail='Paper not found')

    metadata = {
        'title': paper.title,
        'authors': [author.name for author in paper.authors],
        'published': paper.published.strftime('%Y-%m-%d'),
        'updated': paper.updated.strftime('%Y-%m-%d') if paper.updated else None,
        'arxiv_id': paper.get_short_id(),
        'pdf_url': paper.pdf_url,
        'entry_id': paper.entry_id,
        'abstract': paper.summary,
        'categories': paper.categories,
        'primary_category': paper.primary_category,
    }
    return {'status': 'success', 'metadata': metadata}
127
+
128
+
129
@app.post('/api/download-paper')
async def download_paper(paper_id: PaperID):
    """
    Download a paper's PDF from arXiv.

    Args:
        paper_id (PaperID): Request body carrying the arXiv ID

    Returns:
        dict: ``status`` and the ``file_path`` of the downloaded PDF

    Raises:
        HTTPException: 404 if the fetcher reports a failed download,
            500 if the fetcher raises
    """
    try:
        pdf_path = arxiv_fetcher.download_paper(paper_id.arxiv_id)
    except Exception as e:
        logger.error(f"Error downloading paper: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    # Kept outside the try block: previously this 404 was caught by the
    # generic handler and returned to the client as a 500.
    if not pdf_path:
        raise HTTPException(status_code=404, detail="Failed to download paper")

    logger.info(f"Downloaded paper {paper_id.arxiv_id} to {pdf_path}")

    return {'status': 'success', 'file_path': str(pdf_path)}
145
+
146
+
147
@app.post('/api/upload-paper')
async def upload_paper(file: UploadFile = File(...)):
    """
    Save an uploaded PDF into the temp directory.

    Args:
        file (UploadFile): The uploaded file

    Returns:
        dict: ``status`` and the ``file_path`` the upload was written to

    Raises:
        HTTPException: 500 if the file could not be read or written
    """
    # Local import: uuid is not imported at the top of this module.
    import uuid

    try:
        # A timestamp alone only has one-second resolution, so two uploads in
        # the same second used to overwrite each other; the random suffix
        # keeps every upload distinct.
        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
        filename = f"uploaded_{timestamp}_{uuid.uuid4().hex[:8]}.pdf"
        filepath = TEMP_DIR / filename

        # Save the uploaded file
        with open(filepath, 'wb') as f:
            f.write(await file.read())
        logger.info(f"Uploaded paper saved at {filepath}")

        return {'status': 'success', 'file_path': str(filepath)}

    except Exception as e:
        logger.error(f"Error uploading paper: {e}")
        raise HTTPException(status_code=500, detail=str(e))
166
+
167
+
168
@app.post('/api/process-paper')
async def process_paper(file_path: str = Form(...)):
    """
    Process a paper for RAG: extract, summarize, and index its content.

    The vector store is not reset here; ``/api/reset-chat`` is the endpoint
    that calls ``vector_store.reset()``.

    Args:
        file_path (str): Path of a PDF previously saved in the temp directory
            (as returned by the upload and download endpoints)

    Returns:
        dict: ``status`` and ``stats`` with the number of text, table and
        image elements found

    Raises:
        HTTPException: 400 if the path points outside the temp directory,
            404 if the file does not exist, 500 on any processing failure
    """
    try:
        pdf_path = Path(file_path)
        logger.info(f"Processing paper at {pdf_path}")

        # file_path comes straight from the client, so confine it to the
        # directory this service writes PDFs to; otherwise any readable file
        # on the host could be fed to the parser.
        if not pdf_path.resolve().is_relative_to(TEMP_DIR.resolve()):
            raise HTTPException(status_code=400, detail='file_path must be inside the temp directory')

        if not pdf_path.exists():
            raise HTTPException(status_code=404, detail='PDF file not found')

        # Extract content from PDF
        logger.info(f"Extracting content from {pdf_path}")
        chunks = extract_from_pdf(pdf_path)

        # Separate content types
        logger.info(f"Separating content types from {len(chunks)} chunks")
        content = separate_content_types(chunks)

        # Process and summarize content
        logger.info(f"Processing {len(content['texts'])} text content")
        text_summaries = text_processor.process(content['texts'])

        logger.info(f"Processing {len(content['tables'])} table content")
        table_summaries = table_processor.process(content['tables'])

        logger.info(f"Processing {len(content['images'])} image content")
        image_summaries = image_processor.process(content['images'])

        # Add to vector store
        logger.info("Adding processed content to vector store")
        vector_store.add_contents(
            content['texts'], text_summaries,
            content['tables'], table_summaries,
            content['images'], image_summaries
        )

        logger.info(f"Processed paper {pdf_path.name} successfully")
        return {
            'status': 'success',
            'stats': {
                'texts': len(content['texts']),
                'tables': len(content['tables']),
                'images': len(content['images'])
            }
        }

    except HTTPException:
        # Deliberate 4xx responses must reach the client unchanged instead of
        # being re-wrapped as a 500 by the generic handler below.
        raise

    except Exception as e:
        logger.error(f"Error processing paper: {e}")
        raise HTTPException(status_code=500, detail=str(e))
225
+
226
+
227
@app.post('/api/chat')
async def chat_with_paper(message: ChatMessage):
    """
    Answer a question about the processed paper(s).

    Returns:
        - status: success or error
        - response: The generated text response
        - citations: Dictionary containing three keys:
            - texts: List of text excerpts used as citations
            - images: List of base64-encoded image strings
            - tables: List of HTML-formatted table strings

    Raises:
        HTTPException: 500 if the pipeline or the retrieval fails
    """
    try:
        # Point the pipeline at the vector store's current retriever.
        rag_pipeline.retriever = vector_store.retriever

        question = message.message
        logger.info(f"Chatting with paper: {question}")
        answer = rag_pipeline.query(question)

        # Retrieve the documents separately so they can be returned as citations.
        citations = rag_pipeline.parse_docs(vector_store.retrieve(question))

        return {
            'status': 'success',
            'response': answer['response'],
            'citations': citations,
        }
    except Exception as exc:
        logger.error(f"Error chatting with paper: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
259
+
260
+
261
@app.post('/api/fetch-citations')
async def fetch_citations(message: ChatMessage):
    """
    Return the source documents for a query without generating an answer.

    Only the retrieval step runs: the vector store is queried and the
    retrieved documents are parsed into the citation structure.

    Returns:
        - status: success or error
        - citations: Dictionary containing three keys:
            - texts: List of text excerpts used as citations
            - images: List of base64-encoded image strings
            - tables: List of HTML-formatted table strings

    Raises:
        HTTPException: 500 if retrieval or parsing fails
    """
    try:
        docs = vector_store.retrieve(message.message)
        citations = rag_pipeline.parse_docs(docs)
        return {'status': 'success', 'citations': citations}
    except Exception as exc:
        logger.error(f"Error fetching citations: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
288
+
289
+
290
@app.post('/api/reset-chat')
async def reset_chat():
    """
    Reset the vector store and the pipeline's memory.

    Returns:
        dict: ``status`` and a human-readable ``message``

    Raises:
        HTTPException: 500 if any reset step fails
    """
    try:
        logger.info("Resetting chat and vector store")

        vector_store.reset()
        # Re-read the retriever from the store after the reset.
        rag_pipeline.retriever = vector_store.retriever
        rag_pipeline.reset_memory()

        result = {'status': 'success', 'message': 'Chat reset successfully'}
        return result

    except Exception as exc:
        logger.error(f"Error resetting chat: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
303
+
304
+
305
+ # Serve static files
306
+ app.mount('/static', StaticFiles(directory=ROOT_DIR / 'static', html=False), name='static')
307
+ app.mount('/data' , StaticFiles(directory=ROOT_DIR / 'static/data') , name='data')
308
+ app.mount('/' , StaticFiles(directory=ROOT_DIR / 'static', html=True) , name='root')
src/config.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration settings for the arXivCSRAG application.
3
+ """
4
+ import os
5
+ import torch
6
+ from pathlib import Path
7
+ from dotenv import load_dotenv
8
+ from huggingface_hub import whoami
9
+
10
+
11
# Load environment variables
load_dotenv()

# Look up the Hugging Face account for HF_TOKEN when one is available.
# This module is imported while the API module is loading, and the token can
# also be supplied later through the /api/configure endpoint, so a missing or
# invalid token (or no network) must not abort startup. `user` is None when
# the lookup fails.
try:
    user = whoami(token=os.getenv('HF_TOKEN'))
except Exception as exc:  # broad on purpose: this is a best-effort startup check
    import logging

    logging.getLogger(__name__).warning("Could not verify HF_TOKEN at startup: %s", exc)
    user = None
14
+
15
+
16
+ # Base paths
17
+ ROOT_DIR = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18
+ TEMP_DIR = ROOT_DIR / 'temp'
19
+ if not TEMP_DIR.exists(): TEMP_DIR.mkdir(parents=True, exist_ok=True)
20
+
21
+
22
+ # PDF Extraction Configuration
23
+ PDF_EXTRACTION_CONFIG = {
24
+ 'infer_table_structure' : True,
25
+ 'strategy' : 'hi_res',
26
+ 'extract_image_block_types' : ['Image'],
27
+ 'extract_image_block_to_payload': True,
28
+ 'chunking_strategy' : 'by_title',
29
+ 'max_characters' : 10000,
30
+ 'combine_text_under_n_chars' : 2000,
31
+ 'new_after_n_chars' : 6000
32
+ }
33
+
34
+
35
+ # LLM & Embedding model Configuration
36
+ MODEL_NAME = 'gemini-2.5-flash-lite-preview-06-17'
37
+ # EMBEDDING_MODEL = 'BAAI/bge-base-en-v1.5'
38
+ EMBEDDING_MODEL = 'BAAI/bge-m3' # Multi-lingual BGE model
39
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
40
+
41
+
42
+ # Vector Store Configuration
43
+ COLLECTION_NAME = 'arXiv_CS_RAG'
src/data_extraction/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Data extraction module for PDF documents.
3
+ """
src/data_extraction/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (237 Bytes). View file
 
src/data_extraction/__pycache__/extractor.cpython-310.pyc ADDED
Binary file (1.76 kB). View file
 
src/data_extraction/extractor.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF document extraction utilities.
3
+ """
4
+ from pathlib import Path
5
+ from typing import List, Dict, Any, Union
6
+ from unstructured.partition.pdf import partition_pdf
7
+
8
+ from src.config import PDF_EXTRACTION_CONFIG
9
+
10
+
11
def extract_from_pdf(pdf_path: Union[str, Path]) -> List[Any]:
    """
    Extract content from a PDF file using unstructured.

    Args:
        pdf_path (Union[str, Path]): Path to the PDF file

    Returns:
        List[Any]: Elements returned by ``partition_pdf`` for this file,
        partitioned with ``PDF_EXTRACTION_CONFIG``

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist
    """
    # Path() accepts both str and Path, so no isinstance branch is needed.
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Hand the library a plain string rather than a Path object.
    return partition_pdf(filename=str(pdf_path), **PDF_EXTRACTION_CONFIG)
27
+
28
+
29
def separate_content_types(chunks: List[Any]) -> Dict[str, List[Any]]:
    """
    Separate the extracted content into text, images, and tables.

    Elements are classified by their class name. Top-level ``Table`` and
    ``Image`` chunks are collected directly. A ``CompositeElement`` is
    collected as text, and any ``Image`` or ``Table`` found in its
    ``metadata.orig_elements`` is collected as well. Chunks of any other type
    are ignored.

    Args:
        chunks (List[Any]): List of extracted elements from the PDF

    Returns:
        Dict[str, List[Any]]: Dictionary with keys 'texts', 'images', 'tables'
    """
    texts, images, tables = [], [], []
    buckets = {'Table': tables, 'Image': images}

    for chunk in chunks:
        kind = type(chunk).__name__

        if kind in buckets:
            buckets[kind].append(chunk)

        elif kind == 'CompositeElement':
            texts.append(chunk)

            # orig_elements may be absent or None; treat that as "nothing
            # nested" instead of failing on iteration.
            metadata = getattr(chunk, 'metadata', None)
            for element in getattr(metadata, 'orig_elements', None) or []:
                bucket = buckets.get(type(element).__name__)
                if bucket is not None:
                    bucket.append(element)

    return {'texts': texts, 'images': images, 'tables': tables}
src/fetcher/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ arXiv fetcher utilities module.
3
+ """
src/fetcher/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (219 Bytes). View file
 
src/fetcher/__pycache__/arxiv_fetcher.cpython-310.pyc ADDED
Binary file (3.88 kB). View file
 
src/fetcher/arxiv_cs_subjects.json ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "tag": "cs.AI",
4
+ "name": "Artificial Intelligence"
5
+ },
6
+ {
7
+ "tag": "cs.CL",
8
+ "name": "Computation and Language"
9
+ },
10
+ {
11
+ "tag": "cs.CC",
12
+ "name": "Computational Complexity"
13
+ },
14
+ {
15
+ "tag": "cs.CE",
16
+ "name": "Computational Engineering, Finance, and Science"
17
+ },
18
+ {
19
+ "tag": "cs.CG",
20
+ "name": "Computational Geometry"
21
+ },
22
+ {
23
+ "tag": "cs.GT",
24
+ "name": "Computer Science and Game Theory"
25
+ },
26
+ {
27
+ "tag": "cs.CV",
28
+ "name": "Computer Vision and Pattern Recognition"
29
+ },
30
+ {
31
+ "tag": "cs.CY",
32
+ "name": "Computers and Society"
33
+ },
34
+ {
35
+ "tag": "cs.CR",
36
+ "name": "Cryptography and Security"
37
+ },
38
+ {
39
+ "tag": "cs.DS",
40
+ "name": "Data Structures and Algorithms"
41
+ },
42
+ {
43
+ "tag": "cs.DB",
44
+ "name": "Databases"
45
+ },
46
+ {
47
+ "tag": "cs.DL",
48
+ "name": "Digital Libraries"
49
+ },
50
+ {
51
+ "tag": "cs.DM",
52
+ "name": "Discrete Mathematics"
53
+ },
54
+ {
55
+ "tag": "cs.DC",
56
+ "name": "Distributed, Parallel, and Cluster Computing"
57
+ },
58
+ {
59
+ "tag": "cs.ET",
60
+ "name": "Emerging Technologies"
61
+ },
62
+ {
63
+ "tag": "cs.FL",
64
+ "name": "Formal Languages and Automata Theory"
65
+ },
66
+ {
67
+ "tag": "cs.GL",
68
+ "name": "General Literature"
69
+ },
70
+ {
71
+ "tag": "cs.GR",
72
+ "name": "Graphics"
73
+ },
74
+ {
75
+ "tag": "cs.AR",
76
+ "name": "Hardware Architecture"
77
+ },
78
+ {
79
+ "tag": "cs.HC",
80
+ "name": "Human-Computer Interaction"
81
+ },
82
+ {
83
+ "tag": "cs.IR",
84
+ "name": "Information Retrieval"
85
+ },
86
+ {
87
+ "tag": "cs.IT",
88
+ "name": "Information Theory"
89
+ },
90
+ {
91
+ "tag": "cs.LO",
92
+ "name": "Logic in Computer Science"
93
+ },
94
+ {
95
+ "tag": "cs.LG",
96
+ "name": "Machine Learning"
97
+ },
98
+ {
99
+ "tag": "cs.MS",
100
+ "name": "Mathematical Software"
101
+ },
102
+ {
103
+ "tag": "cs.MA",
104
+ "name": "Multiagent Systems"
105
+ },
106
+ {
107
+ "tag": "cs.MM",
108
+ "name": "Multimedia"
109
+ },
110
+ {
111
+ "tag": "cs.NI",
112
+ "name": "Networking and Internet Architecture"
113
+ },
114
+ {
115
+ "tag": "cs.NE",
116
+ "name": "Neural and Evolutionary Computing"
117
+ },
118
+ {
119
+ "tag": "cs.NA",
120
+ "name": "Numerical Analysis"
121
+ },
122
+ {
123
+ "tag": "cs.OS",
124
+ "name": "Operating Systems"
125
+ },
126
+ {
127
+ "tag": "cs.OH",
128
+ "name": "Other Computer Science"
129
+ },
130
+ {
131
+ "tag": "cs.PF",
132
+ "name": "Performance"
133
+ },
134
+ {
135
+ "tag": "cs.PL",
136
+ "name": "Programming Languages"
137
+ },
138
+ {
139
+ "tag": "cs.RO",
140
+ "name": "Robotics"
141
+ },
142
+ {
143
+ "tag": "cs.SI",
144
+ "name": "Social and Information Networks"
145
+ },
146
+ {
147
+ "tag": "cs.SE",
148
+ "name": "Software Engineering"
149
+ },
150
+ {
151
+ "tag": "cs.SD",
152
+ "name": "Sound"
153
+ },
154
+ {
155
+ "tag": "cs.SC",
156
+ "name": "Symbolic Computation"
157
+ },
158
+ {
159
+ "tag": "cs.SY",
160
+ "name": "Systems and Control"
161
+ }
162
+ ]
src/fetcher/arxiv_fetcher.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import arxiv
2
+ import urllib.request
3
+ from pathlib import Path
4
+ from dateutil import parser
5
+ from typing import List, Dict, Any, Optional
6
+
7
+ from utils.setup_logger import setup_logger
8
+ from src.config import TEMP_DIR
9
+
10
+ # Configure logging
11
+ logger = setup_logger(__name__)
12
+
13
+
14
class ArxivFetcher:
    """Search arXiv for CS papers and download their PDFs into ``TEMP_DIR``."""

    def __init__(self):
        self.client = arxiv.Client()

    @staticmethod
    def _build_query(subject_tags: Optional[List[str]], query: Optional[str]) -> str:
        """Build the arXiv query string from category tags and free-text terms."""
        # A bare string would otherwise be iterated character by character.
        if isinstance(subject_tags, str):
            subject_tags = [subject_tags]

        if subject_tags:
            filter_query = ' OR '.join(f"cat:{tag}" for tag in subject_tags)  # Query with selected tags
        else:
            filter_query = 'cat:cs.*'  # Default to all CS tags

        # A whitespace-only query has no terms; skip the clause instead of
        # emitting an empty "AND ()".
        terms = query.split() if query else []
        if not terms:
            return f"({filter_query})"

        # Every term must appear in the title or the abstract.
        term_query = ' AND '.join(f"(ti:{term} OR abs:{term})" for term in terms)
        return f"({filter_query}) AND ({term_query})"

    def fetch_papers(self,
                     subject_tags: Optional[List[str]] = None,
                     start_date: Optional[str] = None,
                     end_date: Optional[str] = None,
                     max_results: int = 10,
                     query: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Fetches papers from arXiv based on subject tags, search terms and date range.

        The date range is applied to the results after they are fetched, so
        fewer than ``max_results`` papers may be returned when it is set.

        Args:
            subject_tags (list): List of subject tags to filter papers by (all of cs.* if empty)
            start_date (str): Earliest publication date to keep, parsed with dateutil
            end_date (str): Latest publication date to keep, parsed with dateutil
            max_results (int): Maximum number of results to request
            query (str): Whitespace-separated terms matched against title or abstract

        Returns:
            list: List of paper dictionaries with metadata; empty if the search fails
        """
        final_query = self._build_query(subject_tags, query)
        logger.info(f"Fetching papers with query: {final_query}")

        # Search object
        search = arxiv.Search(
            query=final_query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        try:
            results = list(self.client.results(search))

            # Filter by date
            if start_date or end_date:
                start_date_obj = parser.parse(start_date).date() if start_date else None
                end_date_obj = parser.parse(end_date).date() if end_date else None

                filtered_results = []
                for paper in results:
                    paper_date = paper.published.date()
                    if start_date_obj and paper_date < start_date_obj:
                        continue
                    if end_date_obj and paper_date > end_date_obj:
                        continue
                    filtered_results.append(paper)
                results = filtered_results

            # Convert to dictionary format with required metadata
            return [
                {
                    'title': paper.title,
                    'authors': [author.name for author in paper.authors],
                    'published': paper.published.strftime('%Y-%m-%d'),
                    'updated': paper.updated.strftime('%Y-%m-%d') if paper.updated else None,
                    'arxiv_id': paper.get_short_id(),
                    'pdf_url': paper.pdf_url,
                    'entry_id': paper.entry_id,
                    'abstract': paper.summary,
                    'categories': paper.categories,
                    'primary_category': paper.primary_category,
                }
                for paper in results
            ]

        except Exception as e:
            # Best-effort by design: callers receive an empty list on failure.
            logger.error(f"Error fetching papers: {e}")
            return []

    def download_paper(self, paper_id: str) -> Optional[Path]:
        """
        Downloads a paper's PDF from arXiv.

        A PDF already present in ``TEMP_DIR`` is reused. New downloads are
        written to a temporary ``.part`` file and renamed only on success, so
        an interrupted download is never mistaken for a cached PDF later.

        Args:
            paper_id (str): The arXiv ID of the paper

        Returns:
            Optional[Path]: Path to the downloaded PDF file, or None if download failed
        """
        partial = None
        try:
            # Create the filename
            filename = f"{paper_id.replace('/', '_')}.pdf"
            filepath = TEMP_DIR / filename
            if filepath.exists():
                return filepath

            # Download the PDF
            pdf_url = f"https://arxiv.org/pdf/{paper_id}"
            partial = filepath.with_name(filepath.name + '.part')
            urllib.request.urlretrieve(pdf_url, partial)
            partial.replace(filepath)
            return filepath

        except Exception as e:
            logger.error(f"Error downloading paper {paper_id}: {e}")
            if partial is not None:
                try:
                    partial.unlink(missing_ok=True)
                except OSError:
                    # Cleanup is best-effort; the failure is already logged.
                    pass
            return None
src/processors/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Content processors module for text, tables, and images.
3
+ """
src/processors/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (246 Bytes). View file
 
src/processors/__pycache__/image_processor.cpython-310.pyc ADDED
Binary file (2.21 kB). View file
 
src/processors/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (5.96 kB). View file
 
src/processors/__pycache__/table_processor.cpython-310.pyc ADDED
Binary file (2.22 kB). View file
 
src/processors/__pycache__/text_processor.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
src/processors/image_processor.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Image content processor for summarization.
3
+ """
4
+ from typing import List, Any, Callable
5
+
6
+ from langchain_google_genai import ChatGoogleGenerativeAI
7
+ from langchain_core.prompts import ChatPromptTemplate
8
+ from langchain_core.output_parsers import StrOutputParser
9
+
10
+ from src.config import MODEL_NAME
11
+ from src.processors.prompts import IMAGE_SUMMARY_PROMPT
12
+
13
+
14
class ImageProcessor:
    """Summarizes image elements with a multimodal LLM chain."""

    def __init__(self, model_name: str = MODEL_NAME):
        """
        Build the LLM client and the summarization chain.

        Args:
            model_name (str): Name of the LLM model to use
        """
        self.llm = ChatGoogleGenerativeAI(model=model_name)
        self.chain = self._create_summary_chain()

    def _create_summary_chain(self) -> Callable:
        """
        Assemble the prompt -> LLM -> string-parser chain for images.

        Returns:
            Callable: The image summarization chain
        """
        instruction_part = {'type': 'text', 'text': IMAGE_SUMMARY_PROMPT}
        # The {image} placeholder is filled with the base64 payload on invoke.
        image_part = {
            'type': 'image_url',
            'image_url': {'url': 'data:image/jpeg;base64,{image}'},
        }
        prompt = ChatPromptTemplate.from_messages([('user', [instruction_part, image_part])])

        return prompt | self.llm | StrOutputParser()

    def process(self, images: List[Any]) -> List[str]:
        """
        Summarize each image element, preserving input order.

        Args:
            images (List[Any]): List of image elements to summarize

        Returns:
            List[str]: List of image summaries
        """
        return [
            self.chain.invoke({'image': image.metadata.image_base64})
            for image in images
        ]
src/processors/prompts.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prompt templates for the LLM processors.
3
+ """
4
+
5
+
6
+ # Text summarization prompt
7
+ TEXT_SUMMARY_PROMPT = """
8
+ # ROLE
9
+ You are a highly specialized text processing engine. Your only function is to describe and summarize text.
10
+
11
+ # INSTRUCTIONS
12
+ You will be given a piece of text as input. Your task is to:
13
+ 1. Identify the main subject and purpose of the text (description).
14
+ 2. Extract and synthesize the most important points and key ideas (summary).
15
+ 3. Combine these into a single, cohesive response.
16
+
17
+ # STRICT RULES
18
+ - **DO NOT** use any conversational language, introductions, or concluding remarks. (e.g., "Here is a summary:", "The text discusses...", "In summary...", "The text describes...").
19
+ - **DO NOT** refer to yourself, your instructions, or your role as an AI.
20
+ - **DO NOT** add any information, examples, or opinions not found directly in the input text.
21
+ - **DO NOT** apologize, express uncertainty, or ask for clarification.
22
+ - Your output **MUST** be limited strictly to the description and summary of the text. There should be no other text, characters, or formatting.
23
+
24
+ # TASK
25
+ Analyze the following input text and generate ONLY the description and summary of its key ideas. Do not include any additional text, explanations, or formatting.
26
+
27
+ --- INPUT TEXT ---
28
+ {text}
29
+
30
+ --- OUTPUT TEXT ---
31
+ """
32
+
33
+
34
+ # Table summarization prompt
35
+ TABLE_SUMMARY_PROMPT = """
36
+ # ROLE
37
+ You are a highly specialized data processing engine. Your only function is to interpret and summarize data from tables.
38
+
39
+ # INSTRUCTIONS
40
+ You will be given a table in HTML format as input. Your task is to:
41
+ 1. Interpret the HTML structure (e.g., `<table>`, `<th>`, `<tr>`, `<td>`) to understand the data's organization, columns, and rows.
42
+ 2. Identify the main subject of the table. What data is it presenting? (This is the description).
43
+ 3. Summarize the key insights, trends, or significant relationships presented in the data. (This is the summary). **DO NOT** simply list the data row by row.
44
+ 4. Combine these into a single, cohesive text response.
45
+
46
+ # STRICT RULES
47
+ - **DO NOT** use any conversational language, introductions, or concluding remarks. (e.g., "Here is a summary:", "The table discusses...", "In summary...", "The table describes...").
48
+ - **DO NOT** refer to yourself, your instructions, or your role as an AI.
49
+ - **DO NOT** add any information or interpretations not directly supported by the data in the table.
50
+ - **DO NOT** apologize, express uncertainty, or ask for clarification.
51
+ - Your output **MUST** be limited strictly to the description and summary of the table's data. There should be no other text, characters, or formatting.
52
+
53
+ # TASK
54
+ Analyze the following HTML table and generate ONLY the description and summary of its key ideas and data. Do not include any additional text, explanations, or formatting.
55
+
56
+ --- INPUT TABLE (HTML) ---
57
+ {table}
58
+
59
+ --- OUTPUT TEXT ---
60
+ """
61
+
62
+
63
+ # Image summarization prompt
64
+ IMAGE_SUMMARY_PROMPT = """
65
+ # ROLE
66
+ You are a highly specialized Visual Analysis Engine. Your sole function is to analyze the provided image and extract all relevant information into a concise text description.
67
+
68
+ # INSTRUCTIONS
69
+ You will be given an image as input. Your task is to:
70
+ 1. First, identify the type of image (e.g., photograph, bar chart, line graph, diagram, flowchart).
71
+ 2. Based on the image type, extract all key visual and textual information:
72
+ - **For charts or graphs:** Identify the title, axis labels (X and Y), units, and legend. Summarize the data trends, key values (highs, lows, significant points), and the primary relationship the data illustrates.
73
+ - **For diagrams or flowcharts:** Identify all components, labels, and connectors (like arrows). Describe the process, hierarchy, or system being shown from start to finish.
74
+ - **For photographs or scenes:** Describe the main subject(s), objects, setting/environment, any visible text, and the key actions taking place.
75
+ 3. Synthesize all extracted information into a single, comprehensive summary. The goal is to create a text-based representation of the image that captures all its important content.
76
+
77
+ # STRICT RULES
78
+ - **DO NOT** use any conversational language, introductions, or concluding remarks (e.g., "The image shows...", "In this picture...").
79
+ - **DO NOT** refer to yourself, your instructions, or your role as an AI.
80
+ - **DO NOT** speculate or infer information beyond what is visually present. Do not guess emotions, intentions, or events happening outside the frame unless explicitly supported by visual cues.
81
+ - **DO NOT** express personal opinions or make subjective aesthetic judgments about the image.
82
+ - **DO NOT** apologize, express uncertainty, or ask for clarification.
83
+ - Your output **MUST** be a single block of text containing only the comprehensive description and summary. There should be no other text, characters, or formatting.
84
+
85
+ # TASK
86
+ Analyze the following image and generate ONLY a comprehensive text description of its content and key data. Do not include any additional text, explanations, or formatting.
87
+ """
88
+
89
+
90
+ # RAG system message
91
+ RAG_SYSTEM_MESSAGE = """
92
+ You are a helpful AI assistant with conversational memory. Your task is to answer the user's question based on the provided context and your memory of the conversation history.
93
+
94
+ - If the information needed to answer the question is not in the context, you MUST respond with the exact phrase: `Sorry 🥹, I don't have enough information to answer this question.` or 'Xin lỗi 🥹, tôi không đủ thông tin để trả lời câu hỏi này.', based on the language of the question.
95
+ - You should use the conversation history to provide more coherent and contextually relevant responses.
96
+ - When referencing previous exchanges, do so naturally within your response.
97
+ - Your entire answer must be grounded in the provided text and conversation history.
98
+ - Format your response in Markdown.
99
+
100
+ Below is the context provided to you:
101
+ """
src/processors/table_processor.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Table content processor for summarization.
3
+ """
4
+ from typing import List, Any, Callable
5
+
6
+ from langchain_google_genai import ChatGoogleGenerativeAI
7
+ from langchain_core.prompts import ChatPromptTemplate
8
+ from langchain_core.output_parsers import StrOutputParser
9
+
10
+ from src.config import MODEL_NAME
11
+ from src.processors.prompts import TABLE_SUMMARY_PROMPT
12
+
13
+
14
class TableProcessor:
    """Summarize table elements by sending their HTML to a chat LLM."""

    def __init__(self, model_name: str = MODEL_NAME):
        """
        Set up the LLM client and build the summarization chain.

        Args:
            model_name (str): Name of the LLM model to use
        """
        self.llm   = ChatGoogleGenerativeAI(model=model_name)
        self.chain = self._create_summary_chain()

    def _create_summary_chain(self) -> Callable:
        """
        Assemble the input-mapping -> prompt -> LLM -> string-parser chain.

        The chain input is passed through unchanged under the `table` key,
        which is the variable the prompt template expects.

        Returns:
            Callable: The table summarization chain
        """
        prompt = ChatPromptTemplate.from_template(TABLE_SUMMARY_PROMPT)

        return {'table': lambda html: html} | prompt | self.llm | StrOutputParser()

    def process(self, tables: List[Any]) -> List[str]:
        """
        Summarize each table element, one chain invocation per element.

        Args:
            tables (List[Any]): Table elements; each must expose
                `metadata.text_as_html`

        Returns:
            List[str]: One summary per input table, in input order
        """
        return [
            self.chain.invoke(element.metadata.text_as_html)
            for element in tables
        ]
src/processors/text_processor.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text content processor for summarization.
3
+ """
4
+ from typing import List, Any, Callable
5
+
6
+ from langchain_google_genai import ChatGoogleGenerativeAI
7
+ from langchain_core.prompts import ChatPromptTemplate
8
+ from langchain_core.output_parsers import StrOutputParser
9
+
10
+ from src.config import MODEL_NAME
11
+ from src.processors.prompts import TEXT_SUMMARY_PROMPT
12
+
13
+
14
class TextProcessor:
    """Summarize text elements by sending their content to a chat LLM."""

    def __init__(self, model_name: str = MODEL_NAME):
        """
        Set up the LLM client and build the summarization chain.

        Args:
            model_name (str): Name of the LLM model to use
        """
        self.llm   = ChatGoogleGenerativeAI(model=model_name)
        self.chain = self._create_summary_chain()

    def _create_summary_chain(self) -> Callable:
        """
        Assemble the input-mapping -> prompt -> LLM -> string-parser chain.

        The chain input is passed through unchanged under the `text` key,
        which is the variable the prompt template expects.

        Returns:
            Callable: The text summarization chain
        """
        prompt = ChatPromptTemplate.from_template(TEXT_SUMMARY_PROMPT)

        return {'text': lambda raw: raw} | prompt | self.llm | StrOutputParser()

    def process(self, texts: List[Any]) -> List[str]:
        """
        Summarize each text element, one chain invocation per element.

        Args:
            texts (List[Any]): Text elements; each must expose `text`

        Returns:
            List[str]: One summary per input element, in input order
        """
        return [self.chain.invoke(element.text) for element in texts]
src/rag/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ RAG pipeline implementation module.
3
+ """
src/rag/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (219 Bytes). View file
 
src/rag/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (4.93 kB). View file
 
src/rag/pipeline.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG pipeline implementation.
3
+ """
4
+ from typing import Dict, List, Any, Callable
5
+ from operator import itemgetter
6
+
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+ from langchain_core.messages import SystemMessage, HumanMessage
10
+ from langchain_core.runnables import RunnablePassthrough, RunnableLambda
11
+ from langchain_core.output_parsers import StrOutputParser
12
+ from langchain.retrievers.multi_vector import MultiVectorRetriever
13
+ from langchain.memory.summary import ConversationSummaryMemory
14
+
15
+ from src.config import MODEL_NAME
16
+ from src.processors.prompts import RAG_SYSTEM_MESSAGE
17
+
18
+
19
class RAGPipeline:
    """Retrieval-augmented question answering with conversation memory."""

    def __init__(self, retriever: MultiVectorRetriever, model_name: str = MODEL_NAME):
        """
        Wire together the retriever, the LLM, the RAG chain and the memory.

        Args:
            retriever (MultiVectorRetriever): The document retriever
            model_name (str): Name of the LLM model to use
        """
        self.retriever = retriever
        self.llm       = ChatGoogleGenerativeAI(model=model_name)
        self.rag_chain = self._create_rag_chain()
        # The memory keys below must match the ones used in `query`.
        self.memory    = ConversationSummaryMemory(
            llm=self.llm,
            memory_key="chat_history",
            return_messages=True,
            input_key="question",
            output_key="response"
        )

    def parse_docs(self, docs: List[Any]) -> Dict[str, List[Any]]:
        """
        Sort retrieved documents into texts, images and tables.

        Documents are classified by their class name; any document whose
        class name is not one of the three handled below is ignored.

        Args:
            docs (List[Any]): List of retrieved documents

        Returns:
            Dict[str, List[Any]]: Dictionary with keys 'texts', 'images', 'tables'
        """
        texts, images, tables = [], [], []

        for doc in docs:
            kind = type(doc).__name__
            if kind == 'Table':
                tables.append(doc.metadata.text_as_html)
            elif kind == 'Image':
                images.append(doc.metadata.image_base64)
            elif kind == 'CompositeElement':
                texts.append(doc.text)

        return {'texts': texts, 'images': images, 'tables': tables}

    def _build_prompt(self, kwargs: Dict[str, Any]) -> ChatPromptTemplate:
        """
        Build the full message list for one RAG query.

        Message order: system message, conversation history (if any), one
        message per text, one per table, one per image, then the question.

        Args:
            kwargs (Dict[str, Any]): Dictionary with keys 'context', 'question',
                and optionally 'chat_history'

        Returns:
            ChatPromptTemplate: The chat prompt template
        """
        context  = kwargs['context']
        question = kwargs['question']
        history  = kwargs.get('chat_history', [])

        messages = [SystemMessage(content=RAG_SYSTEM_MESSAGE)]
        if history:
            messages += history

        messages.extend(
            HumanMessage(content=[{'type': 'text', 'text': f"[TEXT]:\n{txt}"}])
            for txt in context['texts']
        )
        messages.extend(
            HumanMessage(content=[{'type': 'text', 'text': f"[TABLE]:\n```html\n{tbl}\n```"}])
            for tbl in context['tables']
        )
        for img in context['images']:
            label   = {'type': 'text', 'text': "[IMAGE]:\n"}
            picture = {'type': 'image_url', 'image_url': {'url': f"data:image/jpeg;base64,{img}"}}
            messages.append(HumanMessage(content=[label, picture]))

        closing = f"Based on the above contexts and our conversation history, answer the question: {question}"
        messages.append(HumanMessage(content=[{'type': 'text', 'text': closing}]))

        return ChatPromptTemplate.from_messages(messages)

    def _create_rag_chain(self) -> Callable:
        """
        Compose the retrieval and answer-generation steps into one chain.

        The chain expects a dict with 'question' and 'chat_history'. It adds a
        'context' entry (parsed retrieval results) and a 'response' entry.

        Returns:
            Callable: The RAG chain
        """
        # The question is prefixed with "query: " before it reaches the retriever.
        fetch_context = (
            itemgetter('question')
            | RunnableLambda(lambda q: f"query: {q}")
            | self.retriever
            | RunnableLambda(self.parse_docs)
        )
        chain_inputs = {
            'context'     : fetch_context,
            'question'    : itemgetter('question'),
            'chat_history': itemgetter('chat_history'),
        }
        generate_answer = (
            RunnableLambda(self._build_prompt)
            | self.llm
            | StrOutputParser()
        )

        return chain_inputs | RunnablePassthrough().assign(response=generate_answer)

    def query(self, question: str) -> Dict[str, Any]:
        """
        Answer a question and record the exchange in memory.

        Args:
            question (str): The question to answer

        Returns:
            Dict[str, Any]: Dictionary with keys 'context', 'question',
                'chat_history', and 'response'
        """
        # Load whatever history the memory currently holds.
        history = self.memory.load_memory_variables({}).get('chat_history', [])

        result = self.rag_chain.invoke({'question': question, 'chat_history': history})

        # Store this exchange so later queries can see it.
        self.memory.save_context({"question": question}, {"response": result['response']})

        return result

    def reset_memory(self):
        """Clear the conversation memory."""
        self.memory.clear()
src/storage/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Vector storage and retrieval module.
3
+ """
src/storage/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (224 Bytes). View file
 
src/storage/__pycache__/vectorstore.cpython-310.pyc ADDED
Binary file (5.36 kB). View file
 
src/storage/vectorstore.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector storage and retrieval implementation.
3
+ """
4
+ import uuid
5
+ from typing import List, Any
6
+
7
+ from langchain_chroma import Chroma
8
+ from langchain.storage import InMemoryStore
9
+ from langchain.schema.document import Document
10
+ from langchain_huggingface import HuggingFaceEmbeddings
11
+ from langchain.retrievers.multi_vector import MultiVectorRetriever
12
+
13
+ from src.config import EMBEDDING_MODEL, DEVICE, COLLECTION_NAME
14
+
15
+
16
class VectorStore:
    """Summary embeddings in Chroma plus an in-memory store of the originals."""

    def __init__(self, collection_name: str = COLLECTION_NAME, embedding_model: str = EMBEDDING_MODEL):
        """
        Initialize the vector store.

        Args:
            collection_name (str): Name of the vector store collection
            embedding_model (str): Name of the embedding model to use
        """
        self.embedding_function = self._create_embedding_function(embedding_model)
        self.vector_store       = self._create_vector_store(collection_name)
        self.doc_store          = InMemoryStore()
        # Metadata key that links a summary in the vector store to its
        # original element in the document store.
        self.id_key             = 'doc_id'
        self.retriever          = self._create_retriever()

    def _create_embedding_function(self, model_name: str) -> HuggingFaceEmbeddings:
        """
        Create an embedding function.

        Args:
            model_name (str): Name of the embedding model

        Returns:
            HuggingFaceEmbeddings: The embedding function
        """
        return HuggingFaceEmbeddings(
            model_name    = model_name,
            model_kwargs  = {'device': DEVICE},
            # Change this if the model already produces normalized embeddings.
            encode_kwargs = {'normalize_embeddings': True}
        )

    def _create_vector_store(self, collection_name: str) -> Chroma:
        """
        Create a vector store.

        Args:
            collection_name (str): Name of the vector store collection

        Returns:
            Chroma: The vector store
        """
        return Chroma(
            collection_name    = collection_name,
            embedding_function = self.embedding_function,
        )

    def _create_retriever(self) -> MultiVectorRetriever:
        """
        Create a multi-vector retriever over the current stores.

        Returns:
            MultiVectorRetriever: The retriever
        """
        return MultiVectorRetriever(
            vectorstore = self.vector_store,
            docstore    = self.doc_store,
            id_key      = self.id_key,
        )

    def add_to_retriever(self, data: List[Any], data_summaries: List[str]) -> None:
        """
        Add data and summaries to the retriever.

        Each element gets a fresh UUID. Its summary is stored in the vector
        store with that id in the metadata, and the element itself is stored
        in the document store under the same id.

        Args:
            data (List[Any]): List of data elements
            data_summaries (List[str]): List of data summaries

        Raises:
            ValueError: If `data` is non-empty and the two lists differ in length
        """
        # Nothing to index; this also skips the length check below.
        if not data:
            return

        if len(data) != len(data_summaries):
            raise ValueError(f"Length mismatch: {len(data)} data but {len(data_summaries)} summaries")

        ids = [str(uuid.uuid4()) for _ in data]

        summaries = [
            Document(
                # Prefix required by the current embedding model; adjust it
                # if a different model is used.
                page_content = f"passage: {summary}",
                metadata     = {self.id_key: doc_id}
            )
            for doc_id, summary in zip(ids, data_summaries)
        ]

        self.retriever.vectorstore.add_documents(summaries)
        self.retriever.docstore.mset(list(zip(ids, data)))

    def add_contents(self,
                     texts : List[Any], text_summaries : List[str],
                     tables: List[Any], table_summaries: List[str],
                     images: List[Any], image_summaries: List[str]) -> None:
        """
        Add all content types and their summaries to the retriever.

        Args:
            texts (List[Any]): List of text elements
            text_summaries (List[str]): List of text summaries
            tables (List[Any]): List of table elements
            table_summaries (List[str]): List of table summaries
            images (List[Any]): List of image elements
            image_summaries (List[str]): List of image summaries

        Raises:
            ValueError: If any non-empty element list differs in length from
                its summary list
        """
        self.add_to_retriever(texts , text_summaries)
        self.add_to_retriever(tables, table_summaries)
        self.add_to_retriever(images, image_summaries)

    def reset(self) -> None:
        """
        Reset the vector store and document store.

        Raises:
            RuntimeError: If resetting the vector store collection fails; the
                original exception is attached as the cause
        """
        try:
            self.vector_store.reset_collection()
        except Exception as e:
            # Chain the cause so the underlying failure stays in the traceback.
            raise RuntimeError(f"Failed to reset vector store: {e}") from e

        # NOTE: the document store and retriever are replaced by new objects.
        # Code that kept a reference to the previous `self.retriever` will
        # still point at the old document store.
        self.doc_store = InMemoryStore()
        self.retriever = self._create_retriever()

    def retrieve(self, query: str) -> List[Any]:
        """
        Retrieve relevant documents for a query.

        Args:
            query (str): The query string

        Returns:
            List[Any]: List of retrieved documents
        """
        return self.retriever.invoke(query)
static/css/modern-styles.css ADDED
@@ -0,0 +1,1177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Modern Styles for arXivCSRAG Application - 2025 Edition */
2
+
3
+ /* Color Scheme & Variables */
4
+ :root {
5
+ /* Core colors */
6
+ --primary: #4361ee;
7
+ --primary-light: #4cc9f0;
8
+ --primary-dark: #3a0ca3;
9
+ --secondary: #4b4d63;
10
+ --accent: #f72585;
11
+
12
+ /* Neutral colors */
13
+ --background: #f8f9fd;
14
+ --surface: #ffffff;
15
+ --surface-variant: #f0f2f9;
16
+
17
+ /* Text colors */
18
+ --text-primary: #1e1e2f;
19
+ --text-secondary: #4b4d63;
20
+ --text-tertiary: #6e7191;
21
+ --text-on-primary: #ffffff;
22
+ --text-on-accent: #ffffff;
23
+
24
+ /* Status colors */
25
+ --success: #06d6a0;
26
+ --warning: #ffd166;
27
+ --error: #ef476f;
28
+ --info: #118ab2;
29
+
30
+ /* Border colors */
31
+ --border: #e2e8f0;
32
+ --border-hover: #cbd5e1;
33
+
34
+ /* Shadows */
35
+ --shadow-sm: 0 1px 2px rgba(30, 30, 47, 0.05);
36
+ --shadow-md: 0 4px 8px rgba(30, 30, 47, 0.08);
37
+ --shadow-lg: 0 8px 16px rgba(30, 30, 47, 0.1);
38
+
39
+ /* Spacing */
40
+ --space-1: 0.25rem;
41
+ --space-2: 0.5rem;
42
+ --space-3: 0.75rem;
43
+ --space-4: 1rem;
44
+ --space-5: 1.5rem;
45
+ --space-6: 2rem;
46
+ --space-8: 3rem;
47
+
48
+ /* Dimensions */
49
+ --header-height: 70px;
50
+ --border-radius-sm: 6px;
51
+ --border-radius-md: 8px;
52
+ --border-radius-lg: 12px;
53
+ --border-radius-xl: 16px;
54
+
55
+ /* Typography */
56
+ --font-sans: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
57
+ --font-mono: 'JetBrains Mono', SFMono-Regular, Menlo, Monaco, Consolas, monospace;
58
+
59
+ /* Transitions */
60
+ --transition-fast: 150ms cubic-bezier(0.4, 0, 0.2, 1);
61
+ --transition-normal: 250ms cubic-bezier(0.4, 0, 0.2, 1);
62
+
63
+ /* Color variable for RGB usage */
64
+ --primary-rgb: 67, 97, 238;
65
+ }
66
+
67
+ /* Base Reset */
68
+ *, *::before, *::after {
69
+ box-sizing: border-box;
70
+ margin: 0;
71
+ padding: 0;
72
+ }
73
+
74
+ html {
75
+ font-size: 16px;
76
+ scroll-behavior: smooth;
77
+ }
78
+
79
+ body {
80
+ font-family: var(--font-sans);
81
+ line-height: 1.5;
82
+ color: var(--text-primary);
83
+ background-color: var(--background);
84
+ -webkit-font-smoothing: antialiased;
85
+ -moz-osx-font-smoothing: grayscale;
86
+ overflow-x: hidden;
87
+ margin: 0;
88
+ padding: 0;
89
+ min-height: 100vh;
90
+ }
91
+
92
+ /* Typography */
93
+ h1, h2, h3, h4, h5, h6 {
94
+ font-weight: 600;
95
+ line-height: 1.2;
96
+ color: var(--text-primary);
97
+ margin-bottom: var(--space-4);
98
+ }
99
+
100
+ h1 {
101
+ font-size: 2.25rem;
102
+ letter-spacing: -0.025em;
103
+ }
104
+
105
+ h2 {
106
+ font-size: 1.75rem;
107
+ letter-spacing: -0.0125em;
108
+ }
109
+
110
+ h3 {
111
+ font-size: 1.5rem;
112
+ }
113
+
114
+ h4 {
115
+ font-size: 1.25rem;
116
+ }
117
+
118
+ p {
119
+ margin-bottom: var(--space-4);
120
+ }
121
+
122
+ a {
123
+ color: var(--primary);
124
+ text-decoration: none;
125
+ transition: color var(--transition-fast);
126
+ }
127
+
128
+ a:hover {
129
+ color: var(--primary-dark);
130
+ }
131
+
132
+ /* Layout */
133
+ .app-container {
134
+ max-width: 1600px;
135
+ margin: 0 auto;
136
+ padding: 0 var(--space-4);
137
+ min-height: 100vh;
138
+ display: flex;
139
+ flex-direction: column;
140
+ }
141
+
142
+ /* Header */
143
+ header {
144
+ height: var(--header-height);
145
+ display: flex;
146
+ align-items: center;
147
+ justify-content: space-between;
148
+ /* padding: 0 var(--space-5); */
149
+ border-bottom: 1px solid var(--border);
150
+ background-color: var(--surface);
151
+ }
152
+
153
+ .header-content {
154
+ display: flex;
155
+ align-items: center;
156
+ gap: var(--space-3);
157
+ }
158
+
159
+ .header-content h1 {
160
+ color: var(--primary);
161
+ margin-bottom: 0;
162
+ font-weight: 700;
163
+ }
164
+
165
+ .header-content p {
166
+ color: var(--text-tertiary);
167
+ margin-bottom: 0;
168
+ font-size: 3rem;
169
+ position: relative;
170
+ top: 2px;
171
+ }
172
+
173
+ /* Main Content */
174
+ main {
175
+ padding: var(--space-5) 0;
176
+ flex: 1;
177
+ }
178
+
179
+ .content-row {
180
+ display: flex;
181
+ gap: var(--space-5);
182
+ height: calc(100vh - var(--header-height) - var(--space-5) * 2 - 60px); /* 60px for footer */
183
+ }
184
+
185
+ .panel {
186
+ background-color: var(--surface);
187
+ border-radius: var(--border-radius-lg);
188
+ box-shadow: var(--shadow-md);
189
+ overflow: hidden;
190
+ display: flex;
191
+ flex-direction: column;
192
+ height: 100%;
193
+ }
194
+
195
+ .left-panel {
196
+ flex: 1;
197
+ min-width: 0; /* Prevents flex items from overflowing */
198
+ }
199
+
200
+ .right-panel {
201
+ flex: 1;
202
+ min-width: 0;
203
+ }
204
+
205
+ /* Nav Tabs */
206
+ .nav-tabs {
207
+ display: flex;
208
+ border-bottom: 1px solid var(--border);
209
+ background-color: var(--surface-variant);
210
+
211
+ min-height: 76px; /* Increase height while maintaining flexibility */
212
+ align-items: center; /* Vertically center the nav links */
213
+ }
214
+
215
+ .nav-link {
216
+ /* padding: var(--space-4) var(--space-5); */
217
+ padding-top: 0.8rem;
218
+ padding-bottom: 0.8rem;
219
+ padding-left: 0.6rem;
220
+ padding-right: 0.6rem;
221
+ font-size: 1rem;
222
+ font-weight: 500;
223
+ color: var(--text-tertiary);
224
+ border: none;
225
+ background: transparent;
226
+ cursor: pointer;
227
+ position: relative;
228
+ transition: color var(--transition-fast);
229
+
230
+ height: 100%; /* Make links fill the height of nav-tabs */
231
+ /* display: flex; */
232
+ align-items: center;
233
+ }
234
+
235
+ .nav-link:hover {
236
+ color: var(--primary);
237
+ }
238
+
239
+ .nav-link.active {
240
+ color: var(--primary);
241
+ font-weight: 600;
242
+ }
243
+
244
+ .nav-link.active::after {
245
+ content: '';
246
+ position: absolute;
247
+ bottom: 0;
248
+ left: 0;
249
+ width: 100%;
250
+ height: 2px;
251
+ background-color: var(--primary);
252
+ }
253
+
254
+ /* Tab Content */
255
+ .tab-content {
256
+ flex: 1;
257
+ overflow: auto;
258
+ padding: var(--space-5);
259
+ }
260
+
261
+ .tab-pane {
262
+ display: none;
263
+ }
264
+
265
+ .tab-pane.active {
266
+ display: block;
267
+ }
268
+
269
+ /* Search Form */
270
+ .search-section {
271
+ margin-bottom: var(--space-5);
272
+ }
273
+
274
+ .search-section h2 {
275
+ font-size: 1.25rem;
276
+ margin-bottom: var(--space-4);
277
+ color: var(--text-primary);
278
+ }
279
+
280
+ .filters-grid {
281
+ display: grid;
282
+ grid-template-columns: 1.4fr 0.6fr;
283
+ /* gap: var(--space-4); */
284
+ /* row-gap: 0.5rem; */
285
+ column-gap: 1rem;
286
+ /* margin-bottom: var(--space-5); */
287
+ margin-bottom: 0.3rem;
288
+ }
289
+
290
+ .filter-group {
291
+ margin-bottom: var(--space-4);
292
+ }
293
+
294
+ .filter-group label {
295
+ display: block;
296
+ margin-bottom: var(--space-2);
297
+ font-weight: 500;
298
+ font-size: 0.875rem;
299
+ color: var(--text-secondary);
300
+ }
301
+
302
+ .form-control {
303
+ width: 100%;
304
+ padding: var(--space-3);
305
+ border: 1px solid var(--border);
306
+ border-radius: var(--border-radius-sm);
307
+ font-family: var(--font-sans);
308
+ font-size: 0.9375rem;
309
+ background-color: var(--surface);
310
+ color: var(--text-primary);
311
+ transition: border-color var(--transition-fast), box-shadow var(--transition-fast);
312
+ }
313
+
314
.form-control:focus {
    outline: none;
    border-color: var(--primary-light);
    /* Focus ring built from --primary-rgb so it follows the theme colour. */
    box-shadow: 0 0 0 3px rgba(var(--primary-rgb), 0.15);
}
319
+
320
+ .form-select {
321
+ appearance: none;
322
+ background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16' viewBox='0 0 24 24' fill='none' stroke='%234b4d63' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'%3E%3C/polyline%3E%3C/svg%3E");
323
+ background-repeat: no-repeat;
324
+ background-position: right 0.75rem center;
325
+ background-size: 1rem;
326
+ padding-right: 2.5rem;
327
+ }
328
+
329
+ .date-range-inputs {
330
+ display: flex;
331
+ gap: var(--space-3);
332
+
333
+ flex-direction: column;
334
+ }
335
+
336
+ .btn {
337
+ display: inline-flex;
338
+ align-items: center;
339
+ justify-content: center;
340
+ padding: var(--space-3) var(--space-4);
341
+ font-weight: 500;
342
+ font-size: 0.9375rem;
343
+ border-radius: var(--border-radius-sm);
344
+ border: none;
345
+ cursor: pointer;
346
+ transition: background-color var(--transition-fast), transform var(--transition-fast);
347
+ }
348
+
349
+ .btn:active {
350
+ transform: translateY(1px);
351
+ }
352
+
353
+ .btn-primary {
354
+ background-color: var(--primary);
355
+ color: var(--text-on-primary);
356
+ }
357
+
358
+ .btn-primary:hover {
359
+ background-color: var(--primary-dark);
360
+ }
361
+
362
+ .btn-secondary {
363
+ background-color: var(--surface-variant);
364
+ color: var(--text-secondary);
365
+ }
366
+
367
+ .btn-secondary:hover {
368
+ background-color: var(--border);
369
+ }
370
+
371
+ .btn-outline {
372
+ background-color: transparent;
373
+ border: 1px solid var(--border);
374
+ color: var(--text-secondary);
375
+ }
376
+
377
+ .btn-outline:hover {
378
+ background-color: var(--surface-variant);
379
+ }
380
+
381
+ .btn i {
382
+ margin-right: var(--space-2);
383
+ }
384
+
385
+ .search-btn {
386
+ width: 100%;
387
+ /* margin-top: var(--space-4); */
388
+ }
389
+
390
+ .upload-section {
391
+ border-top: 1px solid var(--border);
392
+ padding-top: var(--space-4);
393
+ margin-top: 1rem;
394
+ }
395
+
396
+ .upload-section p {
397
+ margin-bottom: var(--space-3);
398
+ font-size: 0.9375rem;
399
+ font-weight: 500;
400
+ }
401
+
402
+ .upload-container {
403
+ display: flex;
404
+ flex-direction: column;
405
+ gap: var(--space-3);
406
+ }
407
+
408
+ .file-input-wrapper {
409
+ position: relative;
410
+ overflow: hidden;
411
+ display: inline-block;
412
+ width: 100%;
413
+ }
414
+
415
+ .file-input-button {
416
+ display: flex;
417
+ align-items: center;
418
+ gap: 8px;
419
+ background-color: var(--surface-variant);
420
+ color: var(--primary);
421
+ border: 2px dashed var(--primary-light);
422
+ border-radius: var(--border-radius-sm);
423
+ padding: 12px 16px;
424
+ font-weight: 500;
425
+ cursor: pointer;
426
+ transition: all 0.2s ease;
427
+ width: 100%;
428
+ justify-content: center;
429
+ }
430
+
431
+ .file-input-button:hover {
432
+ background-color: rgba(67, 97, 238, 0.05);
433
+ border-color: var(--primary);
434
+ }
435
+
436
+ .file-input-button i {
437
+ font-size: 1.2rem;
438
+ }
439
+
440
+ .file-name-display {
441
+ margin-top: 8px;
442
+ padding: 8px 12px;
443
+ background-color: var(--surface-variant);
444
+ border-radius: var(--border-radius-sm);
445
+ font-size: 0.875rem;
446
+ display: none;
447
+ align-items: center;
448
+ gap: 8px;
449
+ white-space: nowrap;
450
+ overflow: hidden;
451
+ text-overflow: ellipsis;
452
+ border: 1px solid var(--border);
453
+ }
454
+
455
+ .file-name-display.active {
456
+ display: flex;
457
+ }
458
+
459
+ .file-name-display i {
460
+ color: var(--success);
461
+ flex-shrink: 0;
462
+ font-size: 1rem;
463
+ }
464
+
465
+ .file-name-display span {
466
+ overflow: hidden;
467
+ text-overflow: ellipsis;
468
+ font-weight: 500;
469
+ }
470
+
471
+ #pdf-upload {
472
+ position: absolute;
473
+ left: 0;
474
+ top: 0;
475
+ opacity: 0;
476
+ width: 100%;
477
+ height: 100%;
478
+ cursor: pointer;
479
+ }
480
+
481
+ #upload-button {
482
+ width: 100%;
483
+ display: flex;
484
+ align-items: center;
485
+ justify-content: center;
486
+ gap: 8px;
487
+ padding: 10px 16px;
488
+ }
489
+
490
+ #upload-button.file-selected {
491
+ background-color: var(--success);
492
+ border-color: var(--success);
493
+ }
494
+
495
+ #upload-button i {
496
+ font-size: 1.1rem;
497
+ }
498
+
499
+ /* Results List */
500
+ .results-container {
501
+ padding-top: var(--space-2);
502
+ }
503
+
504
+ .no-results {
505
+ text-align: center;
506
+ color: var(--text-tertiary);
507
+ padding: var(--space-8) 0;
508
+ }
509
+
510
+ .paper-item {
511
+ border: 1px solid var(--border);
512
+ border-radius: var(--border-radius-md);
513
+ padding: var(--space-4);
514
+ margin-bottom: var(--space-4);
515
+ cursor: pointer;
516
+ transition: border-color var(--transition-normal), box-shadow var(--transition-normal), transform var(--transition-normal);
517
+ }
518
+
519
+ .paper-item:hover {
520
+ border-color: var(--primary-light);
521
+ box-shadow: var(--shadow-md);
522
+ transform: translateY(-2px);
523
+ }
524
+
525
+ .paper-item-header {
526
+ display: flex;
527
+ justify-content: space-between;
528
+ margin-bottom: var(--space-3);
529
+ }
530
+
531
+ .paper-title { /* single-line paper title; overflow is cut off with an ellipsis */
532
+ font-weight: 600;
533
+ color: var(--primary);
534
+ margin-bottom: var(--space-2);
535
+ font-size: 1.0625rem;
536
+
537
+ flex: 1;
538
+ margin-right: var(--space-3); /* Leave a gap between the title and the date */
539
+ white-space: nowrap;
540
+ overflow: hidden;
541
+ text-overflow: ellipsis;
542
+ }
543
+
544
+ .paper-date {
545
+ color: var(--text-tertiary);
546
+ font-size: 0.875rem;
547
+ }
548
+
549
+ .paper-authors {
550
+ margin-bottom: var(--space-3);
551
+ font-size: 0.9375rem;
552
+ color: var(--text-secondary);
553
+ }
554
+
555
+ .paper-categories {
556
+ display: flex;
557
+ flex-wrap: wrap;
558
+ gap: var(--space-2);
559
+ margin-top: var(--space-3);
560
+ }
561
+
562
+ .paper-category {
563
+ background-color: var(--surface-variant);
564
+ padding: var(--space-1) var(--space-2);
565
+ border-radius: var(--border-radius-sm);
566
+ font-size: 0.75rem;
567
+ color: var(--text-tertiary);
568
+ }
569
+
570
+ /* Chat Interface */
571
+ .chat-panel {
572
+ display: flex;
573
+ flex-direction: column;
574
+ height: 100%;
575
+ }
576
+
577
+ .chat-header {
578
+ padding: var(--space-4) var(--space-5);
579
+ border-bottom: 1px solid var(--border);
580
+ display: flex;
581
+ justify-content: space-between;
582
+ align-items: center;
583
+ background-color: var(--surface-variant);
584
+ }
585
+
586
+ .chat-header h2 {
587
+ margin-bottom: 0;
588
+ font-size: 1.25rem;
589
+ }
590
+
591
+ .chat-messages {
592
+ flex: 1;
593
+ overflow-y: auto;
594
+ padding: var(--space-5);
595
+ display: flex;
596
+ flex-direction: column;
597
+ gap: var(--space-4);
598
+ }
599
+
600
+ .system-message {
601
+ text-align: center;
602
+ /* padding: var(--space-8) var(--space-4); */
603
+ color: var(--text-tertiary);
604
+ }
605
+
606
+ .message {
607
+ max-width: 85%;
608
+ word-wrap: break-word;
609
+ }
610
+
611
+ .user-message {
612
+ align-self: flex-end;
613
+ background-color: var(--primary);
614
+ color: var(--text-on-primary);
615
+ border-radius: var(--border-radius-lg) var(--border-radius-lg) 0 var(--border-radius-lg);
616
+ padding: var(--space-3) var(--space-4);
617
+ box-shadow: var(--shadow-sm);
618
+ }
619
+
620
+ .bot-message {
621
+ align-self: flex-start;
622
+ background-color: var(--surface-variant);
623
+ border-radius: var(--border-radius-lg) var(--border-radius-lg) var(--border-radius-lg) 0;
624
+ padding: var(--space-3) var(--space-4);
625
+ box-shadow: var(--shadow-sm);
626
+ }
627
+
628
+ .citations {
629
+ margin-top: var(--space-2);
630
+ font-size: 0.75rem;
631
+ color: var(--text-tertiary);
632
+ border-top: 1px solid var(--border);
633
+ padding-top: var(--space-2);
634
+ }
635
+
636
+ .citations-header {
637
+ display: flex;
638
+ justify-content: flex-end;
639
+ align-items: center;
640
+ margin-bottom: var(--space-2);
641
+ }
642
+
643
+ .view-all-citations-btn {
644
+ background-color: transparent;
645
+ color: var(--primary);
646
+ border: 1px solid var(--primary);
647
+ border-radius: var(--border-radius-sm);
648
+ padding: 3px 10px;
649
+ font-size: 0.8rem;
650
+ cursor: pointer;
651
+ transition: all 0.2s;
652
+ white-space: nowrap;
653
+ }
654
+
655
+ .view-all-citations-btn:hover {
656
+ background-color: var(--primary);
657
+ color: var(--text-on-primary);
658
+ }
659
+
660
+ .chat-input-container {
661
+ padding: var(--space-4);
662
+ border-top: 1px solid var(--border);
663
+ display: flex;
664
+ align-items: center;
665
+ gap: var(--space-3);
666
+ }
667
+
668
+ #chat-input { /* chat message box: grows between 54px and 150px, no manual resize handle */
669
+ resize: none;
670
+ flex: 1;
671
+ border-radius: var(--border-radius-lg);
672
+ padding: var(--space-3) var(--space-4);
673
+ min-height: 54px;
674
+ max-height: 150px;
675
+
676
+ /* Intended to vertically center the content/placeholder. NOTE(review): confirm flex alignment has any effect on a <textarea>. */
677
+ display: flex;
678
+ align-items: center;
679
+ }
680
+
681
+ #send-message-btn {
682
+ width: 54px;
683
+ height: 54px;
684
+ border-radius: 50%;
685
+ padding: 0;
686
+ flex-shrink: 0;
687
+ }
688
+
689
+ #send-message-btn i {
690
+ margin-right: 0;
691
+ font-size: 1.25rem;
692
+ flex: 1;
693
+ }
694
+
695
+ /* Modals */
696
+ .modal {
697
+ display: none;
698
+ position: fixed;
699
+ top: 0;
700
+ left: 0;
701
+ width: 100%;
702
+ height: 100%;
703
+ background-color: rgba(30, 30, 47, 0.5);
704
+ backdrop-filter: blur(4px);
705
+ z-index: 1000;
706
+ overflow: auto;
707
+ padding: var(--space-4);
708
+ }
709
+
710
+ .modal-content {
711
+ background-color: var(--surface);
712
+ margin: var(--space-6) auto;
713
+ width: 92%;
714
+ max-width: 1000px;
715
+ border-radius: var(--border-radius-lg);
716
+ box-shadow: var(--shadow-lg);
717
+ animation: modalOpen 0.3s ease forwards;
718
+ overflow: hidden;
719
+ max-height: calc(100vh - 60px);
720
+ display: flex;
721
+ flex-direction: column;
722
+ }
723
+
724
+ @keyframes modalOpen {
725
+ from {
726
+ opacity: 0;
727
+ transform: translateY(-20px) scale(0.95);
728
+ }
729
+ to {
730
+ opacity: 1;
731
+ transform: translateY(0) scale(1);
732
+ }
733
+ }
734
+
735
+ .modal-header {
736
+ padding: var(--space-4) var(--space-5);
737
+ border-bottom: 1px solid var(--border);
738
+ display: flex;
739
+ justify-content: space-between;
740
+ align-items: center;
741
+ }
742
+
743
+ .modal-header h3 {
744
+ margin-bottom: 0;
745
+ font-size: 1.25rem;
746
+ }
747
+
748
+ .close-modal {
749
+ background: none;
750
+ border: none;
751
+ font-size: 1.5rem;
752
+ cursor: pointer;
753
+ color: var(--text-tertiary);
754
+ transition: color var(--transition-fast);
755
+ line-height: 1;
756
+ }
757
+
758
+ .close-modal:hover {
759
+ color: var(--error);
760
+ }
761
+
762
+ .modal-body {
763
+ padding: var(--space-5);
764
+ overflow-y: auto;
765
+ flex: 1;
766
+ }
767
+
768
+ .modal-footer {
769
+ padding: var(--space-4) var(--space-5);
770
+ border-top: 1px solid var(--border);
771
+ display: flex;
772
+ justify-content: flex-end;
773
+ gap: var(--space-3);
774
+ }
775
+
776
+ .form-group {
777
+ margin-bottom: var(--space-4);
778
+ }
779
+
780
+ .form-group label {
781
+ display: block;
782
+ margin-bottom: var(--space-2);
783
+ font-weight: 500;
784
+ }
785
+
786
+ .paper-metadata p {
787
+ margin-bottom: var(--space-3);
788
+ }
789
+
790
+ .paper-abstract {
791
+ margin-top: var(--space-5);
792
+ padding-top: var(--space-4);
793
+ border-top: 1px solid var(--border);
794
+ }
795
+
796
+ .paper-abstract h4 {
797
+ margin-bottom: var(--space-3);
798
+ color: var(--text-secondary);
799
+ }
800
+
801
+ /* Loading Overlay */
802
+ .loading-overlay {
803
+ display: none;
804
+ position: fixed;
805
+ top: 0;
806
+ left: 0;
807
+ width: 100%;
808
+ height: 100%;
809
+ background-color: rgba(30, 30, 47, 0.7);
810
+ backdrop-filter: blur(4px);
811
+ z-index: 2000;
812
+ justify-content: center;
813
+ align-items: center;
814
+ }
815
+
816
+ .spinner-container { /* column layout holding the spinner and its captions */
817
+ text-align: center;
818
+ color: white;
819
+ max-width: 200px;
820
+
821
+ /* Fix alignment */
822
+ display: flex;
823
+ flex-direction: column; /* Vertical stack: spinner on top, text below */
824
+ align-items: center; /* Center horizontally */
825
+ justify-content: center; /* Center vertically */
826
+ }
827
+
828
+ .spinner-border { /* circular loading indicator, rotated by the "spinner" keyframes */
829
+ width: 3rem;
830
+ height: 3rem;
831
+ border: 0.25rem solid rgba(255, 255, 255, 0.3);
832
+ border-radius: 50%;
833
+ border-top-color: white;
834
+ animation: spinner 0.8s linear infinite;
835
+ }
836
+
837
+ .spinner-text { /* caption shown under the spinner */
838
+ margin-top: var(--space-4);
839
+ font-size: 1rem;
840
+ font-weight: 500;
841
+
842
+ max-width: 120px;
843
+ word-wrap: break-word; /* Wrap onto a new line when needed */
844
+ text-align: center; /* Keep wrapped text centered */
845
+ }
855
+
856
+ @keyframes spinner {
857
+ to {
858
+ transform: rotate(360deg);
859
+ }
860
+ }
861
+
862
+ #loading-message { /* status line under the spinner; deliberately wider than its 200px container */
863
+ margin-top: var(--space-4);
864
+ font-size: 1rem;
865
+ font-weight: 500;
866
+ width: min(500px, 90vw); /* 500px on desktop, but never wider than the viewport on small screens */
867
+ text-align: center;
868
+ }
869
+
870
+ /* Citations Modal Styles */
871
+ .citation-query {
872
+ font-style: italic;
873
+ margin-bottom: var(--space-4);
874
+ padding: var(--space-3);
875
+ background-color: var(--surface-variant);
876
+ border-left: 3px solid var(--primary);
877
+ border-radius: var(--border-radius-sm);
878
+ font-size: 1rem;
879
+ }
880
+
881
+ .citations-container {
882
+ overflow-y: auto;
883
+ }
884
+
885
+ .citation-section {
886
+ margin-bottom: var(--space-6);
887
+ border-bottom: 1px solid var(--border);
888
+ padding-bottom: var(--space-4);
889
+ }
890
+
891
+ .citation-section:last-child {
892
+ border-bottom: none;
893
+ }
894
+
895
+ .citation-section h4 {
896
+ color: var(--secondary);
897
+ margin-bottom: var(--space-3);
898
+ font-size: 1.1rem;
899
+ }
900
+
901
+ .citation-list {
902
+ display: flex;
903
+ flex-direction: column;
904
+ gap: var(--space-4);
905
+ }
906
+
907
+ .text-citation, .table-citation, .image-citation {
908
+ display: flex;
909
+ gap: var(--space-3);
910
+ background-color: var(--surface-variant);
911
+ border-radius: var(--border-radius-sm);
912
+ padding: var(--space-3);
913
+ box-shadow: var(--shadow-sm);
914
+ align-items: flex-start;
915
+ overflow: hidden;
916
+ width: 100%;
917
+ }
918
+
919
+ .citation-number {
920
+ background-color: var(--primary);
921
+ color: var(--text-on-primary);
922
+ width: 24px;
923
+ height: 24px;
924
+ min-width: 24px;
925
+ border-radius: 50%;
926
+ display: flex;
927
+ justify-content: center;
928
+ align-items: center;
929
+ font-size: 0.8rem;
930
+ flex-shrink: 0;
931
+ }
932
+
933
+ .citation-text {
934
+ line-height: 1.5;
935
+ font-size: 0.9rem;
936
+ }
937
+
938
+ .citation-table-container {
939
+ width: 100%;
940
+ overflow-x: auto;
941
+ max-width: calc(100% - 40px); /* Account for the citation number */
942
+ border-radius: var(--border-radius-sm);
943
+ }
944
+
945
+ .citation-table {
946
+ font-size: 0.9rem;
947
+ overflow: hidden;
948
+ }
949
+
950
+ .citation-table table {
951
+ border-collapse: collapse;
952
+ width: 100%;
953
+ max-width: 100%;
954
+ table-layout: auto;
955
+ margin: 0;
956
+ }
957
+
958
+ .citation-table th, .citation-table td {
959
+ border: 1px solid var(--border);
960
+ padding: 4px 8px;
961
+ word-break: break-word;
962
+ }
963
+
964
+ .citation-table th {
965
+ background-color: var(--surface-variant);
966
+ }
967
+
968
+ .citation-image-container {
969
+ width: 100%;
970
+ max-width: calc(100% - 40px); /* Account for the citation number */
971
+ display: flex;
972
+ justify-content: center;
973
+ overflow: hidden;
974
+ margin: 0 auto;
975
+ }
976
+
977
+ /* Citation Image Spinner */
978
+ .citation-image-loading {
979
+ display: flex;
980
+ justify-content: center;
981
+ align-items: center;
982
+ min-height: 100px;
983
+ color: var(--text-tertiary);
984
+ }
985
+
986
+ .responsive-image {
987
+ max-width: 100%;
988
+ max-height: 280px;
989
+ object-fit: contain;
990
+ border-radius: var(--border-radius-sm);
991
+ display: block;
992
+ }
993
+
994
+ /* Footer Styles */
995
+ .app-footer {
996
+ /* padding: var(--space-4) var(--space-5); */
997
+ /* background-color: var(--surface-variant); */
998
+ border-top: 1px solid var(--border);
999
+ margin-top: auto;
1000
+ }
1001
+
1002
+ .footer-content {
1003
+ display: flex;
1004
+ justify-content: space-between;
1005
+ align-items: center;
1006
+ /* max-width: 1200px; */
1007
+ /* margin: 0 auto; */
1008
+ /* font-size: 0.9rem; */
1009
+ color: var(--text-secondary);
1010
+ margin-bottom: 20px;
1011
+ margin-top: 10px;
1012
+ }
1013
+
1014
+ .footer-content p {
1015
+ margin: 0;
1016
+ display: flex;
1017
+ align-items: center;
1018
+ height: 100%;
1019
+ }
1020
+
1021
+ .github-link {
1022
+ display: flex;
1023
+ align-items: center;
1024
+ gap: var(--space-2);
1025
+ color: var(--primary);
1026
+ text-decoration: none;
1027
+ transition: color 0.2s ease;
1028
+ }
1029
+
1030
+ .github-link:hover {
1031
+ color: var(--primary-dark);
1032
+ text-decoration: underline;
1033
+ }
1034
+
1035
+ .github-link i {
1036
+ font-size: 1.1rem;
1037
+ }
1038
+
1039
+ /* Tooltip Styles */
1040
+ .tooltip-container {
1041
+ position: relative;
1042
+ display: inline-flex;
1043
+ /* align-items: center; */
1044
+ }
1045
+
1046
+ .tooltip-icon {
1047
+ margin-left: var(--space-2);
1048
+ color: var(--text-tertiary);
1049
+ cursor: help;
1050
+ font-size: 1rem;
1051
+ }
1052
+
1053
+ .tooltip-icon:hover {
1054
+ color: var(--primary);
1055
+ }
1056
+
1057
+ .tooltip-text {
1058
+ visibility: hidden;
1059
+ position: absolute;
1060
+ z-index: 100;
1061
+ bottom: 125%;
1062
+ /* left: 20%; */
1063
+ transform: translateX(5%) translateY(100%);
1064
+ background-color: var(--text-primary);
1065
+ color: white;
1066
+ text-align: center;
1067
+ border-radius: var(--border-radius-sm);
1068
+ padding: var(--space-2) var(--space-3);
1069
+ width: max-content;
1070
+ max-width: 400px;
1071
+ font-size: 0.7rem;
1072
+ box-shadow: var(--shadow-md);
1073
+ opacity: 0;
1074
+ transition: opacity 0.3s;
1075
+ }
1076
+
1077
+ /* .tooltip-text::after {
1078
+ content: "";
1079
+ position: absolute;
1080
+ top: 100%;
1081
+ left: 50%;
1082
+ margin-left: -5px;
1083
+ border-width: 5px;
1084
+ border-style: solid;
1085
+ border-color: var(--text-primary) transparent transparent transparent;
1086
+ } */
1087
+
1088
+ .tooltip-container:hover .tooltip-text {
1089
+ visibility: visible;
1090
+ opacity: 1;
1091
+ }
1092
+
1093
+ /* Memory feature styling */
1094
+ .memory-info { /* highlighted note box for the conversation-memory feature */
1095
+ display: block;
1096
+ margin-top: 10px;
1097
+ padding: 8px 12px;
1098
+ background-color: rgba(var(--primary-rgb, 67, 97, 238), 0.1); /* fallback triple keeps the tint valid if --primary-rgb is not defined */
1099
+ border-left: 3px solid var(--primary);
1100
+ border-radius: 4px;
1101
+ font-style: italic;
1102
+ color: var(--primary-dark);
1103
+ }
1104
+
1105
+ /* Responsive Styles */
1106
+ @media (max-width: 1200px) {
1107
+ .filters-grid {
1108
+ grid-template-columns: 1fr;
1109
+ }
1110
+
1111
+ .content-row {
1112
+ flex-direction: column;
1113
+ height: auto;
1114
+ gap: var(--space-4);
1115
+ }
1116
+
1117
+ .panel {
1118
+ height: 600px;
1119
+ }
1120
+ }
1121
+
1122
+ @media (max-width: 768px) {
1123
+ .header-content h1 {
1124
+ font-size: 1.75rem;
1125
+ }
1126
+
1127
+ .header-content p {
1128
+ display: none;
1129
+ }
1130
+
1131
+ .date-range-inputs {
1132
+ flex-direction: column;
1133
+ }
1134
+
1135
+ .upload-container {
1136
+ flex-direction: column;
1137
+ }
1138
+
1139
+ .modal-content {
1140
+ margin: var(--space-4) auto;
1141
+ }
1142
+ }
1143
+
1144
+ @media (max-width: 576px) {
1145
+ :root {
1146
+ --space-5: 1.25rem;
1147
+ }
1148
+
1149
+ .header-content h1 {
1150
+ font-size: 1.5rem;
1151
+ }
1152
+
1153
+ .app-container {
1154
+ padding: 0 var(--space-3);
1155
+ }
1156
+
1157
+ .panel {
1158
+ border-radius: var(--border-radius-md);
1159
+ }
1160
+
1161
+ .chat-messages {
1162
+ padding: var(--space-3);
1163
+ }
1164
+ }
1165
+
1166
+ /* Utility Classes */
1167
+ .hidden {
1168
+ display: none !important;
1169
+ }
1170
+
1171
+ .show {
1172
+ display: flex !important;
1173
+ }
1174
+
1175
+ /* Add Google Fonts. FIXME: @import is only honored when it precedes all other style rules, so these two imports are ignored at the end of the file; move them to the very top of the stylesheet. */
1176
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
1177
+ @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&display=swap');
static/data/arxiv_cs_subjects.json ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "tag": "cs.AI",
4
+ "name": "Artificial Intelligence"
5
+ },
6
+ {
7
+ "tag": "cs.CL",
8
+ "name": "Computation and Language"
9
+ },
10
+ {
11
+ "tag": "cs.CC",
12
+ "name": "Computational Complexity"
13
+ },
14
+ {
15
+ "tag": "cs.CE",
16
+ "name": "Computational Engineering, Finance, and Science"
17
+ },
18
+ {
19
+ "tag": "cs.CG",
20
+ "name": "Computational Geometry"
21
+ },
22
+ {
23
+ "tag": "cs.GT",
24
+ "name": "Computer Science and Game Theory"
25
+ },
26
+ {
27
+ "tag": "cs.CV",
28
+ "name": "Computer Vision and Pattern Recognition"
29
+ },
30
+ {
31
+ "tag": "cs.CY",
32
+ "name": "Computers and Society"
33
+ },
34
+ {
35
+ "tag": "cs.CR",
36
+ "name": "Cryptography and Security"
37
+ },
38
+ {
39
+ "tag": "cs.DS",
40
+ "name": "Data Structures and Algorithms"
41
+ },
42
+ {
43
+ "tag": "cs.DB",
44
+ "name": "Databases"
45
+ },
46
+ {
47
+ "tag": "cs.DL",
48
+ "name": "Digital Libraries"
49
+ },
50
+ {
51
+ "tag": "cs.DM",
52
+ "name": "Discrete Mathematics"
53
+ },
54
+ {
55
+ "tag": "cs.DC",
56
+ "name": "Distributed, Parallel, and Cluster Computing"
57
+ },
58
+ {
59
+ "tag": "cs.ET",
60
+ "name": "Emerging Technologies"
61
+ },
62
+ {
63
+ "tag": "cs.FL",
64
+ "name": "Formal Languages and Automata Theory"
65
+ },
66
+ {
67
+ "tag": "cs.GL",
68
+ "name": "General Literature"
69
+ },
70
+ {
71
+ "tag": "cs.GR",
72
+ "name": "Graphics"
73
+ },
74
+ {
75
+ "tag": "cs.AR",
76
+ "name": "Hardware Architecture"
77
+ },
78
+ {
79
+ "tag": "cs.HC",
80
+ "name": "Human-Computer Interaction"
81
+ },
82
+ {
83
+ "tag": "cs.IR",
84
+ "name": "Information Retrieval"
85
+ },
86
+ {
87
+ "tag": "cs.IT",
88
+ "name": "Information Theory"
89
+ },
90
+ {
91
+ "tag": "cs.LO",
92
+ "name": "Logic in Computer Science"
93
+ },
94
+ {
95
+ "tag": "cs.LG",
96
+ "name": "Machine Learning"
97
+ },
98
+ {
99
+ "tag": "cs.MS",
100
+ "name": "Mathematical Software"
101
+ },
102
+ {
103
+ "tag": "cs.MA",
104
+ "name": "Multiagent Systems"
105
+ },
106
+ {
107
+ "tag": "cs.MM",
108
+ "name": "Multimedia"
109
+ },
110
+ {
111
+ "tag": "cs.NI",
112
+ "name": "Networking and Internet Architecture"
113
+ },
114
+ {
115
+ "tag": "cs.NE",
116
+ "name": "Neural and Evolutionary Computing"
117
+ },
118
+ {
119
+ "tag": "cs.NA",
120
+ "name": "Numerical Analysis"
121
+ },
122
+ {
123
+ "tag": "cs.OS",
124
+ "name": "Operating Systems"
125
+ },
126
+ {
127
+ "tag": "cs.OH",
128
+ "name": "Other Computer Science"
129
+ },
130
+ {
131
+ "tag": "cs.PF",
132
+ "name": "Performance"
133
+ },
134
+ {
135
+ "tag": "cs.PL",
136
+ "name": "Programming Languages"
137
+ },
138
+ {
139
+ "tag": "cs.RO",
140
+ "name": "Robotics"
141
+ },
142
+ {
143
+ "tag": "cs.SI",
144
+ "name": "Social and Information Networks"
145
+ },
146
+ {
147
+ "tag": "cs.SE",
148
+ "name": "Software Engineering"
149
+ },
150
+ {
151
+ "tag": "cs.SD",
152
+ "name": "Sound"
153
+ },
154
+ {
155
+ "tag": "cs.SC",
156
+ "name": "Symbolic Computation"
157
+ },
158
+ {
159
+ "tag": "cs.SY",
160
+ "name": "Systems and Control"
161
+ }
162
+ ]
static/index.html ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <meta name="description" content="A multi-modal RAG application for arXiv CS papers that allows searching and intelligent conversations">
7
+ <title>arXivCSRAG - Multi-Modal RAG Application</title>
8
+
9
+ <!-- Stylesheets -->
10
+ <link rel="stylesheet" href="/static/css/modern-styles.css">
11
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css">
12
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.css">
13
+
14
+ <!-- Favicon -->
15
+ <link rel="icon" type="image/svg+xml" href="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'%3E%3Cpath fill='%234361ee' d='M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm-5 14H7v-2h7v2zm3-4H7v-2h10v2zm0-4H7V7h10v2z'/%3E%3C/svg%3E">
16
+ </head>
17
+ <body>
18
+ <div class="app-container">
19
+ <!-- Header -->
20
+ <header>
21
+ <div class="header-content">
22
+ <h1>arXivCSRAG</h1>
23
+ <!-- <p>Multi-Modal RAG Application</p> -->
24
+ </div>
25
+ <button id="configure-api-keys-btn" class="btn btn-primary">
26
+ <i class="bi bi-key-fill"></i> Configure API Keys
27
+ </button>
28
+ </header>
29
+
30
+ <main>
31
+ <div class="content-row">
32
+ <!-- Left Panel: Search and Results -->
33
+ <div class="left-panel panel">
34
+ <div class="nav-tabs" role="tablist">
35
+ <button class="nav-link active" id="search-tab" data-tab="search-panel" role="tab" aria-selected="true">
36
+ <i class="bi bi-search"></i> Search
37
+ </button>
38
+ <button class="nav-link" id="results-tab" data-tab="results-panel" role="tab" aria-selected="false">
39
+ <i class="bi bi-list-ul"></i> Results
40
+ </button>
41
+ </div>
42
+
43
+ <div class="tab-content">
44
+ <div class="tab-pane active" id="search-panel" role="tabpanel">
45
+ <!-- Search Panel Section -->
46
+ <div class="search-section">
47
+ <!-- <h2>Search arXiv Papers</h2> -->
48
+
49
+ <div class="filters-grid">
50
+ <!-- Subject Tags Filter -->
51
+ <div class="filter-group">
52
+ <div class="tooltip-container">
53
+ <label for="subject-tags-select">Subject Areas</label>
54
+ <i class="bi bi-info-circle tooltip-icon">
55
+ <span class="tooltip-text">Hold Ctrl (or Cmd on Mac) and click to select multiple tags.</span>
56
+ </i>
57
+ </div>
58
+ <select id="subject-tags-select" class="form-control form-select" multiple>
59
+ <!-- Will be populated via JavaScript -->
60
+ </select>
61
+ </div>
62
+
63
+ <!-- Date Range Filter -->
64
+ <div class="filter-group">
65
+ <label>Publication Date Range</label>
66
+ <div class="date-range-inputs">
67
+ <input type="date" id="start-date" class="form-control" placeholder="From">
68
+ <input type="date" id="end-date" class="form-control" placeholder="To">
69
+ </div>
70
+ </div>
71
+
72
+ <!-- Search Query -->
73
+ <div class="filter-group">
74
+ <label for="search-query">Search Query</label>
75
+ <input type="text" id="search-query" class="form-control" placeholder="Enter keywords, phrases...">
76
+ </div>
77
+
78
+ <!-- Max Results Filter -->
79
+ <div class="filter-group">
80
+ <label for="max-results">Max Results</label>
81
+ <input type="number" id="max-results" class="form-control" value="10" min="1" max="100">
82
+ </div>
83
+
84
+ </div>
85
+
86
+ <!-- Search Button -->
87
+ <button id="search-button" class="btn btn-primary search-btn">
88
+ <i class="bi bi-search"></i> Search for arXiv Papers
89
+ </button>
90
+
91
+ <!-- Alternative: Upload PDF -->
92
+ <div class="upload-section">
93
+ <p>Or upload any PDF file: </p>
94
+ <div class="upload-container">
95
+ <div class="file-input-wrapper">
96
+ <label class="file-input-button" for="pdf-upload">
97
+ <i class="bi bi-file-earmark-pdf"></i>
98
+ <span>Choose PDF File</span>
99
+ </label>
100
+ <input type="file" id="pdf-upload" accept=".pdf">
101
+ </div>
102
+ <div class="file-name-display">
103
+ <i class="bi bi-check-circle"></i>
104
+ <span id="selected-file-name">No file selected</span>
105
+ </div>
106
+ <button id="upload-button" class="btn btn-primary">
107
+ <i class="bi bi-upload"></i> Upload PDF
108
+ </button>
109
+ </div>
110
+ </div>
111
+ </div>
112
+ </div>
113
+
114
+ <div class="tab-pane" id="results-panel" role="tabpanel">
115
+ <!-- Search Results Section -->
116
+ <div class="results-section">
117
+ <!-- <h2>Search Results</h2> -->
118
+ <div id="results-container" class="results-container">
119
+ <!-- Will be populated via JavaScript -->
120
+ <p class="no-results">No results to display. Search for papers above.</p>
121
+ </div>
122
+ </div>
123
+ </div>
124
+ </div>
125
+ </div>
126
+
127
+ <!-- Right Panel: Chat Interface -->
128
+ <div class="right-panel panel">
129
+ <div class="chat-panel">
130
+ <div class="chat-header">
131
+ <h2><i class="bi bi-chat-left-text"></i> Chat with Paper</h2>
132
+ <button id="reset-chat-btn" class="btn btn-outline" disabled>
133
+ <i class="bi bi-arrow-repeat"></i> Reset Chat
134
+ </button>
135
+ </div>
136
+
137
+ <!-- Chat Messages Container -->
138
+ <div id="chat-messages" class="chat-messages">
139
+ <div class="system-message">
140
+ <p>Select a paper from the search results or upload a PDF to start chatting.</p>
141
+ </div>
142
+ </div>
143
+
144
+ <!-- Chat Input -->
145
+ <div class="chat-input-container">
146
+ <textarea id="chat-input" class="form-control" placeholder="Ask a question about the paper..." disabled></textarea>
147
+ <button id="send-message-btn" class="btn btn-primary" disabled>
148
+ <i class="bi bi-send-fill"></i>
149
+ </button>
150
+ </div>
151
+ </div>
152
+ </div>
153
+ </div>
154
+ </main>
155
+
156
+ <!-- Footer -->
157
+ <footer class="app-footer">
158
+ <div class="footer-content">
159
+ <p>© 2025 arXivCSRAG</p>
160
+ <a href="https://github.com/YuITC/arXivRAG-Multimodal-RAG-Chatbot-Application" target="_blank" rel="noopener noreferrer" class="github-link"> <!-- rel keeps the new tab from reaching window.opener -->
161
+ <i class="bi bi-github"></i> GitHub Repository
162
+ </a>
163
+ </div>
164
+ </footer>
165
+ </div>
166
+
167
+ <!-- API Keys Configuration Modal -->
168
+ <div id="api-keys-modal" class="modal">
169
+ <div class="modal-content">
170
+ <div class="modal-header">
171
+ <h3><i class="bi bi-key"></i> Configure API Keys</h3>
172
+ <button class="close-modal">&times;</button>
173
+ </div>
174
+ <div class="modal-body"> <!-- secrets are entered in masked fields with autocomplete disabled -->
175
+ <p>Enter your API keys to use the application:</p>
176
+
177
+ <div class="form-group">
178
+ <label for="gemini-api-key">Google Gemini API Key</label>
179
+ <input type="password" id="gemini-api-key" class="form-control" placeholder="Enter your Gemini API key" autocomplete="off">
180
+ </div>
181
+
182
+ <div class="form-group">
183
+ <label for="huggingface-token">Hugging Face Token</label>
184
+ <input type="password" id="huggingface-token" class="form-control" placeholder="Enter your Hugging Face token" autocomplete="off">
185
+ </div>
186
+ </div>
187
+ <div class="modal-footer">
188
+ <button id="save-api-keys-btn" class="btn btn-primary">Save</button>
189
+ <button class="btn btn-secondary close-btn">Cancel</button>
190
+ </div>
191
+ </div>
192
+ </div>
193
+
194
+ <!-- Paper Information Modal -->
195
+ <div id="paper-info-modal" class="modal">
196
+ <div class="modal-content">
197
+ <div class="modal-header">
198
+ <h3 id="paper-title">Paper Title</h3>
199
+ <button class="close-modal">&times;</button>
200
+ </div>
201
+ <div class="modal-body">
202
+ <div class="paper-metadata">
203
+ <p><strong>Authors:</strong> <span id="paper-authors"></span></p>
204
+ <p><strong>Published:</strong> <span id="paper-published"></span></p>
205
+ <p><strong>Categories:</strong> <span id="paper-categories"></span></p>
206
+ <p><strong>arXiv ID:</strong> <span id="paper-id"></span></p>
207
+ </div>
208
+
209
+ <div class="paper-abstract">
210
+ <h4>Abstract</h4>
211
+ <p id="paper-abstract"></p>
212
+ </div>
213
+ </div>
214
+ <div class="modal-footer">
215
+ <button id="open-arxiv-btn" class="btn btn-outline">
216
+ <i class="bi bi-box-arrow-up-right"></i> Open arXiv Page
217
+ </button>
218
+ <button id="view-pdf-btn" class="btn btn-outline">
219
+ <i class="bi bi-file-earmark-pdf"></i> View PDF
220
+ </button>
221
+ <button id="download-pdf-btn" class="btn btn-outline">
222
+ <i class="bi bi-download"></i> Download PDF
223
+ </button>
224
+ <button id="chat-with-paper-btn" class="btn btn-primary">
225
+ <i class="bi bi-chat-dots"></i> Chat with Paper
226
+ </button>
227
+ </div>
228
+ </div>
229
+ </div>
230
+
231
+ <!-- Loading Overlay -->
232
+ <!-- <div id="loading-overlay" class="loading-overlay">
233
+ <div class="spinner-container">
234
+ <div class="spinner-border" role="status">
235
+ <span class="visually-hidden">Loading...</span>
236
+ </div>
237
+ <p id="loading-message">Processing...</p>
238
+ </div>
239
+ </div> -->
240
+ <div id="loading-overlay" class="loading-overlay">
241
+ <div class="spinner-container">
242
+ <div class="spinner-border"></div>
243
+ <p class="spinner-text">Loading...</p>
244
+ <p id="loading-message">Processing...</p>
245
+ </div>
246
+ </div>
247
+
248
+ <!-- Scripts -->
249
+ <script src="/static/js/api.js"></script>
250
+ <script src="/static/js/ui.js"></script>
251
+ <script src="/static/js/chat.js"></script>
252
+ <script src="/static/js/main.js"></script>
253
+
254
+ <!-- Custom Tab JS -->
255
+ <script>
256
+ // Custom tab functionality (without Bootstrap dependency)
257
+ document.addEventListener('DOMContentLoaded', function() {
258
+ const tabLinks = document.querySelectorAll('.nav-link');
259
+
260
+ tabLinks.forEach(tabLink => {
261
+ tabLink.addEventListener('click', function(e) {
262
+ e.preventDefault();
263
+
264
+ // Remove active class from all tabs and panes
265
+ document.querySelectorAll('.nav-link').forEach(tab => {
266
+ tab.classList.remove('active');
267
+ tab.setAttribute('aria-selected', 'false');
268
+ });
269
+
270
+ document.querySelectorAll('.tab-pane').forEach(pane => {
271
+ pane.classList.remove('active');
272
+ });
273
+
274
+ // Add active class to clicked tab and its pane
275
+ this.classList.add('active');
276
+ this.setAttribute('aria-selected', 'true');
277
+
278
+ const targetId = this.getAttribute('data-tab');
279
+ document.getElementById(targetId).classList.add('active');
280
+ });
281
+ });
282
+ });
283
+ </script>
284
+
285
+ <!-- Citations Modal -->
286
+ <div id="citations-modal" class="modal">
287
+ <div class="modal-content">
288
+ <div class="modal-header">
289
+ <h3>Sources for Query</h3>
290
+ <button class="close-modal">&times;</button> <!-- a button, like the other modals, so it is keyboard-focusable -->
291
+ </div>
292
+ <div class="modal-body">
293
+ <p id="citation-query" class="citation-query"></p>
294
+ <div id="citations-container" class="citations-container">
295
+ <!-- Citations will be rendered here -->
296
+ </div>
297
+ </div>
298
+ <div class="modal-footer">
299
+ <button class="btn btn-secondary close-btn">Close</button>
300
+ </div>
301
+ </div>
302
+ </div>
303
+ </body>
304
+ </html>
static/js/api.js ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * API Service for the arXivCSRAG application
3
+ * Manages all API requests to the backend
4
+ */
5
+
6
class ApiService {
    /**
     * Send a request and decode the JSON body of the response.
     *
     * The body is returned whatever the HTTP status is, so callers keep
     * inspecting `status` on the decoded payload exactly as before. When the
     * body is not valid JSON (for example an HTML error page), an Error that
     * names the URL and HTTP status is thrown instead of a bare SyntaxError.
     *
     * @param {string} url - The endpoint to call
     * @param {Object} options - Options forwarded to fetch()
     * @param {string} errorLabel - Prefix used when logging a failure
     * @returns {Promise<Object>} - The decoded JSON payload
     * @throws {Error} - On network failure or a non-JSON response (logged, then re-thrown)
     */
    static async _request(url, options, errorLabel) {
        try {
            const response = await fetch(url, options);

            try {
                return await response.json();
            } catch (parseError) {
                throw new Error(
                    `Unexpected non-JSON response from ${url} ` +
                    `(HTTP ${response.status}): ${parseError.message}`
                );
            }
        } catch (error) {
            console.error(errorLabel, error);
            throw error;
        }
    }

    /**
     * POST a JSON payload.
     * @param {string} url - The endpoint to call
     * @param {Object} payload - The value serialised as the request body
     * @param {string} errorLabel - Prefix used when logging a failure
     * @returns {Promise<Object>} - The decoded JSON payload
     */
    static _postJson(url, payload, errorLabel) {
        return ApiService._request(url, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify(payload)
        }, errorLabel);
    }

    /**
     * POST multipart form data. No Content-Type header is set on purpose:
     * the browser must generate it together with the multipart boundary.
     * @param {string} url - The endpoint to call
     * @param {FormData} formData - The form fields to send
     * @param {string} errorLabel - Prefix used when logging a failure
     * @returns {Promise<Object>} - The decoded JSON payload
     */
    static _postForm(url, formData, errorLabel) {
        return ApiService._request(url, {
            method: 'POST',
            body: formData
        }, errorLabel);
    }

    /**
     * Configure API keys
     * @param {string} geminiApiKey - The Google Gemini API key
     * @param {string} huggingfaceToken - The Hugging Face token
     * @returns {Promise} - The API response
     */
    static async configureApiKeys(geminiApiKey, huggingfaceToken) {
        return ApiService._postJson('/api/configure', {
            gemini_api_key: geminiApiKey,
            huggingface_token: huggingfaceToken
        }, 'Error configuring API keys:');
    }

    /**
     * Fetch papers from arXiv
     * @param {Object} searchParams - The search parameters
     * @returns {Promise} - The API response
     */
    static async fetchPapers(searchParams) {
        return ApiService._postJson('/api/fetch-papers', searchParams, 'Error fetching papers:');
    }

    /**
     * Get paper metadata
     * @param {string} arxivId - The arXiv ID of the paper
     * @returns {Promise} - The API response
     */
    static async getPaperMetadata(arxivId) {
        return ApiService._postJson('/api/paper-metadata', {
            arxiv_id: arxivId
        }, 'Error getting paper metadata:');
    }

    /**
     * Download a paper
     * @param {string} arxivId - The arXiv ID of the paper
     * @returns {Promise} - The API response
     */
    static async downloadPaper(arxivId) {
        return ApiService._postJson('/api/download-paper', {
            arxiv_id: arxivId
        }, 'Error downloading paper:');
    }

    /**
     * Upload a paper
     * @param {File} file - The PDF file to upload
     * @returns {Promise} - The API response
     */
    static async uploadPaper(file) {
        const formData = new FormData();
        formData.append('file', file);

        return ApiService._postForm('/api/upload-paper', formData, 'Error uploading paper:');
    }

    /**
     * Process a paper for RAG
     * @param {string} filePath - The path to the PDF file
     * @returns {Promise} - The API response
     */
    static async processPaper(filePath) {
        const formData = new FormData();
        formData.append('file_path', filePath);

        return ApiService._postForm('/api/process-paper', formData, 'Error processing paper:');
    }

    /**
     * Chat with a processed paper
     * @param {string} message - The user's message
     * @returns {Promise} - The API response
     */
    static async chatWithPaper(message) {
        return ApiService._postJson('/api/chat', {
            message: message
        }, 'Error chatting with paper:');
    }

    /**
     * Reset the chat
     * @returns {Promise} - The API response
     */
    static async resetChat() {
        // This endpoint is called without a request body.
        return ApiService._request('/api/reset-chat', {
            method: 'POST'
        }, 'Error resetting chat:');
    }

    /**
     * Fetch citations for a specific query
     * @param {string} message - The query message
     * @returns {Promise} - The API response with citations
     */
    static async fetchCitations(message) {
        return ApiService._postJson('/api/fetch-citations', {
            message: message
        }, 'Error fetching citations:');
    }
}
static/js/chat.js ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Chat Manager for the arXivCSRAG application
3
+ * Handles chat interactions and state
4
+ */
5
+
6
class ChatManager {
    /**
     * @param {UIManager} uiManager - UI helper used for every message,
     *     loading-overlay and modal operation performed by this class
     */
    constructor(uiManager) {
        this.uiManager = uiManager;
        // Path of the paper most recently handed to processPaper(), or null.
        this.currentPaperPath = null;
        // True only while a successfully processed paper is available to chat with.
        this.isProcessed = false;

        // Initialize chat events
        this.initEvents();
    }

    /**
     * Initialize chat event listeners
     */
    initEvents() {
        const chatInput = document.getElementById('chat-input');
        const sendButton = document.getElementById('send-message-btn');
        const resetButton = document.getElementById('reset-chat-btn');

        // Send message on button click
        sendButton.addEventListener('click', () => {
            this.sendMessage();
        });

        // Send message on Enter key (but allow Shift+Enter for new lines).
        // While an IME composition is in progress, Enter only confirms the
        // candidate text and must not submit the half-typed message.
        chatInput.addEventListener('keydown', (event) => {
            if (event.key === 'Enter' && !event.shiftKey && !event.isComposing) {
                event.preventDefault();
                this.sendMessage();
            }
        });

        // Reset chat
        resetButton.addEventListener('click', () => {
            this.resetChat();
        });
    }

    /**
     * Send a message to the API.
     * Does nothing when the input is blank or no paper has been processed.
     */
    async sendMessage() {
        const chatInput = document.getElementById('chat-input');
        const message = chatInput.value.trim();

        if (!message || !this.isProcessed) {
            return;
        }

        // Add user message to chat
        this.uiManager.addChatMessage(message, true);

        // Clear input
        chatInput.value = '';

        // Disable chat while waiting for response
        this.uiManager.disableChat();
        this.uiManager.showLoading('Getting answer...');

        try {
            // Send message to API
            const response = await ApiService.chatWithPaper(message);

            if (response.status === 'success') {
                // Add bot response to chat
                this.uiManager.addChatMessage(response.response, false, response.citations);
            } else {
                this.uiManager.addSystemMessage('Error: Failed to get a response. Please try again.');
            }
        } catch (error) {
            console.error('Error in chat:', error);
            this.uiManager.addSystemMessage('Error: An unexpected error occurred. Please try again.');
        } finally {
            // Re-enable chat
            this.uiManager.enableChat();
            this.uiManager.hideLoading();
        }
    }


    /**
     * Process a paper for chatting
     * @param {string} filePath - Path to the paper file
     */
    async processPaper(filePath) {
        if (!filePath) {
            this.uiManager.addSystemMessage('Error: No paper file specified.');
            return;
        }

        this.currentPaperPath = filePath;
        // A previously processed paper must not keep the chat "ready" if
        // processing of the new one fails below.
        this.isProcessed = false;
        this.uiManager.clearChat();
        this.uiManager.disableChat();
        this.uiManager.showLoading('Processing paper... This may take about 1-2 minutes.');

        try {
            const response = await ApiService.processPaper(filePath);

            if (response.status === 'success') {
                this.isProcessed = true;
                this.uiManager.enableChat();
                this.uiManager.addSystemMessage(
                    `Extracted ${response.stats.texts} text chunks, ${response.stats.tables} tables, and ${response.stats.images} images.<br>
                    Paper processed successfully! You can now ask questions about the paper.<br>
                    <span class="memory-info">🧠 The chatbot now has conversational memory and will remember your previous questions.</span>`
                );
            } else {
                this.uiManager.addSystemMessage('Error: Failed to process paper. Please try again.');
            }
        } catch (error) {
            console.error('Error processing paper:', error);
            this.uiManager.addSystemMessage('Error: An unexpected error occurred while processing the paper. Please try again.');
        } finally {
            this.uiManager.hideLoading();
        }
    }


    /**
     * Reset the chat session.
     * Local state is cleared only after the backend reset call succeeds.
     */
    async resetChat() {
        this.uiManager.showLoading('Resetting chat and conversation memory...');

        try {
            await ApiService.resetChat();
            this.isProcessed = false;
            this.currentPaperPath = null;
            this.uiManager.clearChat();
            this.uiManager.disableChat();
            this.uiManager.addSystemMessage('Select a paper from the search results or upload a PDF to start chatting. Conversation memory has been reset.');
        } catch (error) {
            console.error('Error resetting chat:', error);
            this.uiManager.addSystemMessage('Error: Failed to reset chat. Please try again.');
        } finally {
            this.uiManager.hideLoading();
        }
    }


    /**
     * Download and process a paper by arXiv ID
     * @param {string} arxivId - The arXiv ID of the paper
     */
    async downloadAndProcessPaper(arxivId) {
        this.uiManager.showLoading('Downloading paper...');

        try {
            const response = await ApiService.downloadPaper(arxivId);

            if (response.status === 'success') {
                this.uiManager.hideModal(this.uiManager.paperInfoModal);
                // processPaper() hides the loading overlay itself when it finishes.
                await this.processPaper(response.file_path);
            } else {
                this.uiManager.addSystemMessage('Error: Failed to download paper. Please try again.');
                this.uiManager.hideLoading();
            }
        } catch (error) {
            console.error('Error downloading paper:', error);
            this.uiManager.addSystemMessage('Error: An unexpected error occurred while downloading the paper. Please try again.');
            this.uiManager.hideLoading();
        }
    }


    /**
     * Fetch and display citations for a specific message.
     * Does nothing when the message is empty or no paper has been processed.
     * @param {string} message - The message to get citations for
     */
    async fetchAndDisplayCitations(message) {
        if (!message || !this.isProcessed) {
            return;
        }

        this.uiManager.showLoading('Fetching sources...');

        try {
            // Fetch citations from API
            const response = await ApiService.fetchCitations(message);

            if (response.status === 'success') {
                // Create a modal to display the citations
                this.uiManager.showCitationsModal(message, response.citations);
            } else {
                this.uiManager.addSystemMessage('Error: Failed to fetch citations. Please try again.');
            }
        } catch (error) {
            console.error('Error fetching citations:', error);
            this.uiManager.addSystemMessage('Error: An unexpected error occurred while fetching citations.');
        } finally {
            this.uiManager.hideLoading();
        }
    }
}
static/js/main.js ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Main application entry point for arXivCSRAG
3
+ */
4
+
5
// Wires every page-level control (API keys, search, upload, paper modal
// buttons) to UIManager / ChatManager / ApiService once the DOM is ready.
document.addEventListener('DOMContentLoaded', () => {
    // Initialize UI and Chat managers
    const uiManager = new UIManager();
    const chatManager = new ChatManager(uiManager);


    // API Keys configuration
    const saveApiKeysBtn = document.getElementById('save-api-keys-btn');
    saveApiKeysBtn.addEventListener('click', async () => {
        const geminiApiKey = document.getElementById('gemini-api-key').value.trim();
        const huggingfaceToken = document.getElementById('huggingface-token').value.trim();

        if (!geminiApiKey || !huggingfaceToken) {
            alert('Please enter both API keys.');
            return;
        }

        uiManager.showLoading('Configuring API keys...');

        try {
            const response = await ApiService.configureApiKeys(geminiApiKey, huggingfaceToken);

            if (response.status === 'success') {
                uiManager.hideModal(uiManager.apiKeysModal);
                alert('API keys configured successfully!');
            } else {
                alert('Failed to configure API keys. Please try again.');
            }
        } catch (error) {
            console.error('Error saving API keys:', error);
            alert('An unexpected error occurred. Please try again.');
        } finally {
            uiManager.hideLoading();
        }
    });


    // Paper search functionality
    const searchButton = document.getElementById('search-button');
    searchButton.addEventListener('click', async () => {
        // Get search parameters
        const subjectTags = Array.from(document.getElementById('subject-tags-select').selectedOptions).map(option => option.value);
        const startDate = document.getElementById('start-date').value;
        const endDate = document.getElementById('end-date').value;
        // Explicit radix; a blank or non-numeric field falls back to 10 results.
        const maxResults = parseInt(document.getElementById('max-results').value, 10) || 10;
        const query = document.getElementById('search-query').value.trim();

        // An empty query is allowed: the search can be driven by subject
        // tags and dates alone.

        uiManager.showLoading('Searching for papers...');

        try {
            const response = await ApiService.fetchPapers({
                subject_tags: subjectTags.length > 0 ? subjectTags : null,
                start_date  : startDate || null,
                end_date    : endDate || null,
                max_results : maxResults,
                query       : query
            });

            if (response.status === 'success') {
                uiManager.renderSearchResults(response.papers);
            } else {
                alert('Failed to fetch papers. Please try again.');
            }
        } catch (error) {
            console.error('Error searching papers:', error);
            alert('An unexpected error occurred. Please try again.');
        } finally {
            uiManager.hideLoading();
        }
    });


    // Upload paper functionality
    const uploadButton = document.getElementById('upload-button');
    const fileInput = document.getElementById('pdf-upload');
    const fileNameDisplay = document.querySelector('.file-name-display');
    const selectedFileName = document.getElementById('selected-file-name');

    // Render the upload button as "<icon> label". The label is appended as a
    // text node so a file name containing markup characters is shown
    // literally instead of being parsed as HTML.
    const setUploadButtonLabel = (label) => {
        uploadButton.innerHTML = '<i class="bi bi-upload"></i> ';
        uploadButton.appendChild(document.createTextNode(label));
    };

    // Handle file selection display
    fileInput.addEventListener('change', (event) => {
        const file = fileInput.files[0];
        console.log('File selected:', file ? file.name : 'No file');

        if (file) {
            selectedFileName.textContent = file.name;
            fileNameDisplay.classList.add('active');
            uploadButton.classList.add('file-selected');
            // Show at most the first 15 characters of the name on the button.
            setUploadButtonLabel('Upload "' + file.name.substring(0, 15) + (file.name.length > 15 ? '...' : '') + '"');
            console.log('File name display should be visible now');
        } else {
            selectedFileName.textContent = 'No file selected';
            fileNameDisplay.classList.remove('active');
            uploadButton.classList.remove('file-selected');
            setUploadButtonLabel('Upload PDF');
        }
    });

    uploadButton.addEventListener('click', async () => {
        const file = fileInput.files[0];

        if (!file) {
            alert('Please select a PDF file to upload.');
            return;
        }

        // File.type is an empty string when the browser cannot determine the
        // MIME type; fall back to the extension in that case only.
        const isPdf = file.type === 'application/pdf' ||
            (!file.type && /\.pdf$/i.test(file.name));
        if (!isPdf) {
            alert('Please select a valid PDF file.');
            return;
        }

        uiManager.showLoading('Uploading paper...');

        try {
            const response = await ApiService.uploadPaper(file);

            if (response.status === 'success') {
                // processPaper() hides the loading overlay itself when done.
                await chatManager.processPaper(response.file_path);
            } else {
                alert('Failed to upload paper. Please try again.');
                uiManager.hideLoading();
            }
        } catch (error) {
            console.error('Error uploading paper:', error);
            alert('An unexpected error occurred. Please try again.');
            uiManager.hideLoading();
        }
    });


    // Paper info modal buttons. Their data-* attributes are filled in by
    // UIManager.showPaperInfo() for the paper currently shown in the modal.
    document.getElementById('open-arxiv-btn').addEventListener('click', function() {
        const url = this.dataset.url;
        if (url) {
            window.open(url, '_blank');
        }
    });

    document.getElementById('view-pdf-btn').addEventListener('click', function() {
        const url = this.dataset.url;
        if (url) {
            window.open(url, '_blank');
        }
    });

    document.getElementById('download-pdf-btn').addEventListener('click', function() {
        const paperId = this.dataset.paperId;
        if (paperId) {
            // Create a temporary link to download the file
            uiManager.showLoading('Preparing download...');

            ApiService.downloadPaper(paperId)
                .then(response => {
                    if (response.status === 'success') {
                        // Create a link to download the file directly.
                        // NOTE(review): response.file_path is used as a URL here;
                        // confirm the backend serves that path to the browser.
                        const a = document.createElement('a');
                        a.href = response.file_path;
                        // Replace every "/" so the id is a valid file name.
                        a.download = `${paperId.replace(/\//g, '_')}.pdf`;
                        document.body.appendChild(a);
                        a.click();
                        document.body.removeChild(a);
                    } else {
                        alert('Failed to download paper. Please try again.');
                    }
                })
                .catch(error => {
                    console.error('Error downloading paper:', error);
                    alert('An unexpected error occurred. Please try again.');
                })
                .finally(() => {
                    uiManager.hideLoading();
                });
        }
    });

    document.getElementById('chat-with-paper-btn').addEventListener('click', function() {
        const paperId = this.dataset.paperId;
        if (paperId) {
            chatManager.downloadAndProcessPaper(paperId);
        }
    });


    // Show API keys modal on first load
    setTimeout(() => {
        uiManager.showModal(uiManager.apiKeysModal);
    }, 500);
});
static/js/ui.js ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * UI Manager for the arXivCSRAG application
3
+ * Handles UI elements, modals, and rendering
4
+ */
5
+
6
class UIManager {
    constructor() {
        // Cache DOM elements
        this.apiKeysModal = document.getElementById('api-keys-modal');
        this.paperInfoModal = document.getElementById('paper-info-modal');
        this.citationsModal = document.getElementById('citations-modal');
        this.loadingOverlay = document.getElementById('loading-overlay');
        this.loadingMessage = document.getElementById('loading-message');
        this.resultsContainer = document.getElementById('results-container');
        this.chatMessages = document.getElementById('chat-messages');
        this.subjectTagsSelect = document.getElementById('subject-tags-select');

        // Initialize UI
        this.initEventListeners();
        this.loadSubjectTags();
    }


    /**
     * Escape a value for safe interpolation into an HTML string, both as
     * element content and inside a quoted attribute.
     * @param {*} value - The value to escape (converted with String())
     * @returns {string} - The escaped text
     */
    static _escapeHtml(value) {
        return String(value)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
    }


    /**
     * Initialize event listeners for UI elements
     */
    initEventListeners() {
        // Modal close buttons: any close control dismisses every modal.
        document.querySelectorAll('.close-modal, .close-btn').forEach(button => {
            button.addEventListener('click', () => {
                this.hideModal(this.apiKeysModal);
                this.hideModal(this.paperInfoModal);
                this.hideModal(this.citationsModal);
            });
        });

        // Configure API Keys button
        document.getElementById('configure-api-keys-btn').addEventListener('click', () => {
            this.showModal(this.apiKeysModal);
        });

        // Window click to close modals: a click whose target is the modal
        // element itself (not one of its descendants) dismisses that modal.
        window.addEventListener('click', (event) => {
            if (event.target === this.apiKeysModal) {
                this.hideModal(this.apiKeysModal);
            } else if (event.target === this.paperInfoModal) {
                this.hideModal(this.paperInfoModal);
            } else if (event.target === this.citationsModal) {
                this.hideModal(this.citationsModal);
            }
        });
    }


    /**
     * Load subject tags from JSON file and fill the subject <select>.
     * Failures are logged and leave the existing options untouched.
     */
    async loadSubjectTags() {
        try {
            // NOTE(review): the page's scripts are loaded from /static/...;
            // confirm the backend also serves this file under /data/.
            const response = await fetch('/data/arxiv_cs_subjects.json');
            if (!response.ok) {
                throw new Error(`Failed to load subject tags (HTTP ${response.status})`);
            }
            const subjects = await response.json();

            // Clear existing options
            this.subjectTagsSelect.innerHTML = '';

            // Add options to select
            subjects.forEach(subject => {
                const option = document.createElement('option');
                option.value = subject.tag;
                option.textContent = `${subject.tag}: ${subject.name}`;
                this.subjectTagsSelect.appendChild(option);
            });
        } catch (error) {
            console.error('Error loading subject tags:', error);
        }
    }


    /**
     * Show a modal
     * @param {HTMLElement} modal - The modal to show
     */
    showModal(modal) {
        modal.style.display = 'block';
    }


    /**
     * Hide a modal
     * @param {HTMLElement} modal - The modal to hide
     */
    hideModal(modal) {
        modal.style.display = 'none';
    }


    /**
     * Show the loading overlay
     * @param {string} message - The loading message to display
     */
    showLoading(message = 'Processing...') {
        this.loadingMessage.textContent = message;
        this.loadingOverlay.style.display = 'flex';
    }


    /**
     * Hide the loading overlay
     */
    hideLoading() {
        this.loadingOverlay.style.display = 'none';
    }


    /**
     * Render search results
     * @param {Array} papers - The papers to render
     */
    renderSearchResults(papers) {
        if (!papers || papers.length === 0) {
            this.resultsContainer.innerHTML = '<p class="no-results">No papers found matching your criteria.</p>';
            return;
        }

        const escapeHtml = UIManager._escapeHtml;

        this.resultsContainer.innerHTML = '';
        papers.forEach(paper => {
            const paperElement = document.createElement('div');
            paperElement.className = 'paper-item';
            paperElement.dataset.paperId = paper.arxiv_id;

            // Paper metadata comes from an external source, so every field is
            // escaped before being placed into the HTML template.
            const categories = paper.categories.map(cat =>
                `<span class="paper-category">${escapeHtml(cat)}</span>`
            ).join('');

            paperElement.innerHTML = `
                <div class="paper-item-header">
                    <h4 class="paper-title">${escapeHtml(paper.title)}</h4>
                    <span class="paper-date">${escapeHtml(paper.published)}</span>
                </div>
                <p class="paper-authors">${escapeHtml(paper.authors.join(', '))}</p>
                <div class="paper-categories">${categories}</div>
            `;

            paperElement.addEventListener('click', () => {
                this.showPaperInfo(paper);
            });

            this.resultsContainer.appendChild(paperElement);
        });
    }


    /**
     * Show paper information in the modal
     * @param {Object} paper - The paper data
     */
    showPaperInfo(paper) {
        // Set modal content (textContent, so no escaping is needed here)
        document.getElementById('paper-title').textContent = paper.title;
        document.getElementById('paper-authors').textContent = paper.authors.join(', ');
        document.getElementById('paper-published').textContent = paper.published;
        document.getElementById('paper-categories').textContent = paper.categories.join(', ');
        document.getElementById('paper-id').textContent = paper.arxiv_id;
        document.getElementById('paper-abstract').textContent = paper.abstract;

        // Set data attributes for buttons; the click handlers read them back.
        const openArxivBtn = document.getElementById('open-arxiv-btn');
        const viewPdfBtn = document.getElementById('view-pdf-btn');
        const downloadPdfBtn = document.getElementById('download-pdf-btn');
        const chatWithPaperBtn = document.getElementById('chat-with-paper-btn');

        openArxivBtn.dataset.url = paper.entry_id;
        viewPdfBtn.dataset.url = paper.pdf_url;
        downloadPdfBtn.dataset.paperId = paper.arxiv_id;
        chatWithPaperBtn.dataset.paperId = paper.arxiv_id;

        // Show the modal
        this.showModal(this.paperInfoModal);
    }


    /**
     * Add a message to the chat interface
     * @param {string} message - The message text
     * @param {boolean} isUser - Whether the message is from the user
     * @param {Object} citations - Citations from the AI response
     */
    addChatMessage(message, isUser, citations = null) {
        const messageElement = document.createElement('div');
        messageElement.className = `message ${isUser ? 'user-message' : 'bot-message'}`;

        // If it's a bot message, parse markdown (simple version)
        if (!isUser) {
            // Escape first so any markup in the model output is displayed as
            // text; the markdown substitutions below then add the only tags.
            // Simple markdown parsing for bold, italic, code and line breaks.
            const formattedMessage = UIManager._escapeHtml(message)
                .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
                .replace(/\*(.*?)\*/g, '<em>$1</em>')
                .replace(/`(.*?)`/g, '<code>$1</code>')
                .replace(/\n/g, '<br>');

            messageElement.innerHTML = formattedMessage;

            // Add citations if available. A missing or non-array key is
            // treated as "no citations of that kind" rather than throwing.
            const hasCitations = Boolean(citations) && ['texts', 'images', 'tables'].some(
                key => Array.isArray(citations[key]) && citations[key].length > 0
            );
            if (hasCitations) {
                const citationsElement = document.createElement('div');
                citationsElement.className = 'citations';

                // Add sources section with View All button
                const sourcesHeader = document.createElement('div');
                sourcesHeader.className = 'citations-header';
                sourcesHeader.innerHTML = `
                    <button class="view-all-citations-btn">View Sources</button>
                `;
                citationsElement.appendChild(sourcesHeader);

                messageElement.appendChild(citationsElement);

                // Add view all functionality
                const viewAllBtn = sourcesHeader.querySelector('.view-all-citations-btn');
                viewAllBtn.addEventListener('click', () => {
                    this.showCitationsModal('Response to: ' + message, citations);
                });
            }
        } else {
            messageElement.textContent = message;
        }

        this.chatMessages.appendChild(messageElement);
        // Keep the newest message in view.
        this.chatMessages.scrollTop = this.chatMessages.scrollHeight;
    }


    /**
     * Clear the chat messages and post the "chat reset" notice
     */
    clearChat() {
        this.chatMessages.innerHTML = '';
        this.addSystemMessage('Chat reset. You can now ask questions about the paper.');
    }


    /**
     * Add a system message to the chat
     * @param {string} message - The system message, inserted as HTML. Callers
     *     rely on markup such as <br> here, so it must only contain trusted
     *     markup; untrusted text has to be escaped by the caller.
     */
    addSystemMessage(message) {
        const systemMessage = document.createElement('div');
        systemMessage.className = 'system-message';
        systemMessage.innerHTML = `<p>${message}</p>`;
        this.chatMessages.appendChild(systemMessage);
    }


    /**
     * Enable chat functionality
     */
    enableChat() {
        document.getElementById('chat-input').disabled = false;
        document.getElementById('send-message-btn').disabled = false;
        document.getElementById('reset-chat-btn').disabled = false;
    }


    /**
     * Disable chat functionality
     */
    disableChat() {
        document.getElementById('chat-input').disabled = true;
        document.getElementById('send-message-btn').disabled = true;
        document.getElementById('reset-chat-btn').disabled = true;
    }


    /**
     * Show citations in a modal
     * @param {string} query - The query that generated these citations
     * @param {Object} citations - The citations object with texts, tables, and images
     */
    showCitationsModal(query, citations) {
        // Cache DOM elements
        const citationsModal = document.getElementById('citations-modal');
        const citationQuery = document.getElementById('citation-query');
        const citationsContainer = document.getElementById('citations-container');

        // Missing citations or missing keys are rendered as empty sections.
        const { texts, tables, images } = citations || {};
        const textCitations = Array.isArray(texts) ? texts : [];
        const tableCitations = Array.isArray(tables) ? tables : [];
        const imageCitations = Array.isArray(images) ? images : [];

        // Set the query
        citationQuery.textContent = `"${query}"`;

        // Clear previous citations
        citationsContainer.innerHTML = '';

        // Add text citations
        if (textCitations.length > 0) {
            const textSection = document.createElement('div');
            textSection.className = 'citation-section';
            textSection.innerHTML = '<h4>Text Excerpts</h4>';

            const textList = document.createElement('div');
            textList.className = 'citation-list';

            textCitations.forEach((text, index) => {
                const textItem = document.createElement('div');
                textItem.className = 'text-citation';
                // Excerpts are document text, not markup: escape them.
                textItem.innerHTML = `<div class="citation-number">${index + 1}</div><div class="citation-text">${UIManager._escapeHtml(text)}</div>`;
                textList.appendChild(textItem);
            });

            textSection.appendChild(textList);
            citationsContainer.appendChild(textSection);
        }

        // Add table citations
        if (tableCitations.length > 0) {
            const tableSection = document.createElement('div');
            tableSection.className = 'citation-section';
            tableSection.innerHTML = '<h4>Tables</h4>';

            const tableList = document.createElement('div');
            tableList.className = 'citation-list';

            tableCitations.forEach((tableHtml, index) => {
                const tableItem = document.createElement('div');
                tableItem.className = 'table-citation';

                // Wrap the table in a container for better responsiveness.
                // NOTE(review): tableHtml is inserted as raw HTML because it is
                // expected to be table markup; it is derived from document
                // content, so it should be sanitized before reaching here.
                tableItem.innerHTML = `
                    <div class="citation-number">${index + 1}</div>
                    <div class="citation-table-container">
                        <div class="citation-table">${tableHtml}</div>
                    </div>
                `;
                tableList.appendChild(tableItem);
            });

            tableSection.appendChild(tableList);
            citationsContainer.appendChild(tableSection);
        }

        // Add image citations
        if (imageCitations.length > 0) {
            const imageSection = document.createElement('div');
            imageSection.className = 'citation-section';
            imageSection.innerHTML = '<h4>Images</h4>';

            const imageList = document.createElement('div');
            imageList.className = 'citation-list';

            imageCitations.forEach((imageBase64, index) => {
                const imageItem = document.createElement('div');
                imageItem.className = 'image-citation';
                imageItem.innerHTML = `
                    <div class="citation-number">${index + 1}</div>
                    <div class="citation-image-container"></div>
                `;

                // The image is built through DOM properties so the base64
                // payload can never break out of the src attribute.
                const image = document.createElement('img');
                image.className = 'responsive-image';
                image.alt = `Citation image ${index + 1}`;
                image.loading = 'lazy';
                image.src = `data:image/jpeg;base64,${imageBase64}`;
                imageItem.querySelector('.citation-image-container').appendChild(image);

                imageList.appendChild(imageItem);
            });

            imageSection.appendChild(imageList);
            citationsContainer.appendChild(imageSection);
        }

        // Show the modal
        this.showModal(citationsModal);
    }
}
utils/__pycache__/setup_logger.cpython-310.pyc ADDED
Binary file (939 Bytes). View file
 
utils/setup_logger.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Logger setup utilities.
3
+ """
4
+ import logging
5
+
6
def setup_logger(name: str = __name__) -> logging.Logger:
    """
    Set up logger based on the module name. Ensures:
    - No duplicate handlers are added.
    - Propagation to the root logger is disabled.
    - A standard formatter is applied.

    Calling this again for the same name replaces the logger's handlers:
    every handler currently attached to it is detached and closed, then a
    single fresh console handler is installed.

    Args:
        name: Name of the logger to configure. The default is evaluated once,
            at definition time, so it is this module's own ``__name__``.

    Returns:
        The configured logger, set to level INFO.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # Remove all old handlers if they exist (prevents duplicate logs on reload).
    # Only this logger's own list is inspected: ``hasHandlers()`` would also
    # report handlers attached to ancestor loggers. Each removed handler is
    # closed so the resources it holds are released.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # Turn off propagation to the root logger
    logger.propagate = False

    # Create console handler
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter(
        fmt     = "[%(levelname)s]\t%(name)s\t%(funcName)s\t%(message)s",
        # NOTE: datefmt only takes effect if "%(asctime)s" is added to fmt.
        datefmt = "%Y-%m-%d %H:%M:%S"
    )
    console_handler.setFormatter(formatter)

    logger.addHandler(console_handler)
    return logger