Initial design and tests
Browse files- .env +14 -0
- InitialDesign.png +0 -0
- README.md +31 -0
- paperflux/__init__.py +0 -0
- poetry.lock +0 -0
- pyproject.toml +27 -0
- tests/README.md +3 -0
- tests/api_test.py +60 -0
- tests/pdf_workflow_test.py +92 -0
.env
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face
|
| 2 |
+
HF_API_KEY=your_huggingface_key
|
| 3 |
+
|
| 4 |
+
# Gemini
|
| 5 |
+
GEMINI_API_KEY=your_gemini_key
|
| 6 |
+
|
| 7 |
+
# Redis
|
| 8 |
+
REDIS_HOST=localhost
|
| 9 |
+
REDIS_PORT=6379
|
| 10 |
+
REDIS_DB=0
|
| 11 |
+
|
| 12 |
+
# App Config
|
| 13 |
+
DAILY_PAPER_LIMIT=
|
| 14 |
+
CACHE_TTL=86400
|
InitialDesign.png
ADDED
|
README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
```
|
| 2 |
+
paperflux/
|
| 3 |
+
├── .env.example
|
| 4 |
+
├── pyproject.toml
|
| 5 |
+
├── poetry.lock
|
| 6 |
+
├── README.md
|
| 7 |
+
├── .gitignore
|
| 8 |
+
├── src/
|
| 9 |
+
│ ├── __init__.py
|
| 10 |
+
│ ├── tools/
|
| 11 |
+
│ │ ├── __init__.py
|
| 12 |
+
│ │ ├── hf_tools/
|
| 13 |
+
│ │ │ ├── __init__.py
|
| 14 |
+
│ │ │ ├── paper_pdf_tool.py
|
| 15 |
+
│ │ │ └── summarization_tool.py
|
| 16 |
+
│ │ ├── cache/
|
| 17 |
+
│ │ │ ├── __init__.py
|
| 18 |
+
│ │ │ ├── redis_client.py # Core Redis operations
|
| 19 |
+
│ │ │ └── cache_interface.py # Abstract base class
|
| 20 |
+
│ │ └── cache_manager.py # High-level cache operations
|
| 21 |
+
│ ├── agents/
|
| 22 |
+
│ │ ├── __init__.py
|
| 23 |
+
│ │ └── agent.py
|
| 24 |
+
│ ├── models/
|
| 25 |
+
│ │ ├── __init__.py
|
| 26 |
+
│ │ └── model.py # Pydantic models for data validation
|
| 27 |
+
│ ├── scheduler.py # Scheduled cache updates
|
| 28 |
+
│ └── app.py # Gradio web app
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
The layout above is the agentic workflow design. The initial workflow will use the Gemini API key and will later be extended into an agentic system.
|
paperflux/__init__.py
ADDED
|
File without changes
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "paperflux"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = ""
|
| 5 |
+
authors = [
|
| 6 |
+
{name = "kartikbhtt7",email = "kartikbhtt7@gmail.com"}
|
| 7 |
+
]
|
| 8 |
+
license = {text = "MIT"}
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"gradio (>=5.16.0,<6.0.0)",
|
| 13 |
+
"google-generativeai (>=0.8.4,<0.9.0)",
|
| 14 |
+
"redis (>=5.2.1,<6.0.0)",
|
| 15 |
+
"python-dotenv (>=1.0.1,<2.0.0)",
|
| 16 |
+
"requests (>=2.32.3,<3.0.0)",
|
| 17 |
+
"pypdf2 (>=3.0.1,<4.0.0)",
|
| 18 |
+
"apscheduler (>=3.11.0,<4.0.0)",
|
| 19 |
+
"python-multipart (>=0.0.20,<0.0.21)",
|
| 20 |
+
"httpx (>=0.28.1,<0.29.0)",
|
| 21 |
+
"markdown (>=3.7,<4.0)"
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
[build-system]
|
| 26 |
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
| 27 |
+
build-backend = "poetry.core.masonry.api"
|
tests/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
```
|
| 2 |
+
These are simple tests for individual components
|
| 3 |
+
```
|
tests/api_test.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import aiohttp
|
| 2 |
+
import asyncio
|
| 3 |
+
import os
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
# Endpoint requested by fetch_papers() to obtain the daily papers listing (JSON).
API_URL = "https://huggingface.co/api/daily_papers"
|
| 7 |
+
# URL template for a paper's PDF; the "{id}" placeholder is filled with the paper id.
PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
|
| 8 |
+
# Directory (relative to the current working directory) where downloaded PDFs are saved.
DOWNLOAD_DIR = "papers"
|
| 9 |
+
|
| 10 |
+
async def fetch_papers(session):
    """Fetch the daily-papers listing from ``API_URL``.

    Args:
        session: An open client session whose ``get()`` returns an async
            context manager yielding a response with ``status`` and
            ``json()``.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        Exception: If the response status is anything other than 200.
    """
    async with session.get(API_URL) as resp:
        # Guard clause: bail out on any non-OK status before decoding.
        if resp.status != 200:
            raise Exception(f"API request failed: {resp.status}")
        return await resp.json()
|
| 15 |
+
|
| 16 |
+
async def download_pdf(session, paper_entry):
    """Download the PDF for one daily-papers entry into ``DOWNLOAD_DIR``.

    The download is best-effort: any failure is printed and reported through
    the return value rather than raised, so one bad paper does not abort a
    batch run.

    Args:
        session: Open client session used for the GET request.
        paper_entry: Mapping holding the paper id at
            ``paper_entry["paper"]["id"]``.

    Returns:
        A ``(paper_id, ok)`` tuple. ``ok`` is ``True`` only when the PDF was
        fetched with status 200 and written to disk. ``paper_id`` is ``None``
        when the entry carries no id.
    """
    # Bound before the ``try`` so the error handler can always report it,
    # even when the id lookup itself is what failed.
    paper_id = None
    try:
        paper_id = paper_entry["paper"]["id"]
        pdf_url = PDF_BASE_URL.format(id=paper_id)
        # A "/" inside the id would be treated as a path separator.
        clean_id = paper_id.replace("/", "_")
        filename = f"{datetime.now().date()}_{clean_id}.pdf"
        filepath = os.path.join(DOWNLOAD_DIR, filename)

        async with session.get(pdf_url) as response:
            if response.status != 200:
                return (paper_id, False)
            content = await response.read()

        with open(filepath, "wb") as f:
            f.write(content)
        return (paper_id, True)
    except Exception as e:
        print(f"Error downloading {paper_id}: {str(e)}")
        return (paper_id, False)
|
| 34 |
+
|
| 35 |
+
# Runs at import time: make sure the download directory exists before any
# download_pdf() call tries to write into it.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
|
| 36 |
+
|
| 37 |
+
async def main():
    """Fetch the daily papers, print a summary of each, then download all PDFs.

    Metadata fields that are missing or ``None`` in the API payload are
    printed as empty/``None`` instead of crashing the listing. All PDF
    downloads are started concurrently and the number of successes is
    reported at the end.
    """
    async with aiohttp.ClientSession() as session:
        papers = await fetch_papers(session)

        print(f"\nFound {len(papers)} papers:")
        for i, paper_entry in enumerate(papers, 1):
            paper = paper_entry.get("paper", {})
            # ``or`` fallbacks cover keys that are present but set to None.
            authors = ", ".join(
                author.get("name") or "" for author in paper.get("authors") or []
            )
            summary = paper.get("summary") or ""

            print(f"\nPaper {i}:")
            print(f"ID: {paper.get('id')}")
            print(f"Title: {paper.get('title')}")
            print(f"Authors: {authors}")
            print(f"Published: {paper.get('publishedAt')}")
            print(f"Summary: {summary[:200]}...")
            print(f"PDF URL: {PDF_BASE_URL.format(id=paper.get('id'))}")

        # The session must stay open until every download has finished.
        tasks = [download_pdf(session, paper) for paper in papers]
        results = await asyncio.gather(*tasks)

    successful = sum(1 for _, status in results if status)
    print(f"Downloaded {successful}/{len(papers)} papers successfully")


if __name__ == "__main__":
    asyncio.run(main())
|
tests/pdf_workflow_test.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import google.generativeai as genai
|
| 3 |
+
from google.generativeai.types import HarmCategory, HarmBlockThreshold
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
# Take the key from the environment so no secret has to be pasted into source.
# Falls back to "" (the previous hard-coded value) when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
genai.configure(api_key=GEMINI_API_KEY)
|
| 8 |
+
|
| 9 |
+
class PaperAnalyzer:
    """Send a research-paper PDF to a Gemini model and return its analysis."""

    # Instruction text sent alongside the uploaded PDF.
    _PROMPT = """Analyze this research paper thoroughly, considering both text and visual elements:
Provide in depth explanation with all core mathematical concepts and intuition behind them.

1. Paper Structure Analysis:
- Identify key sections (Abstract, Methodology, Results, etc.)
- Map the paper's argument flow

2. Technical Content:
- Explain core innovations with equations/examples
- Analyze diagrams/figures and their significance
- Extract key algorithms/pseudocode

3. Critical Evaluation:
- Strengths/weaknesses of methodology
- Compare with cited works
- Suggest improvements

4. Visual Element Analysis:
- Describe important figures/diagrams
- Explain visual data representations
- Connect images to textual content

Format output in Markdown with these sections:
# Paper Title
## Core Contribution
## Technical Breakdown
## Visual Analysis
## Critical Assessment
## Potential Applications
"""

    def __init__(self):
        self.model = genai.GenerativeModel('gemini-1.5-pro-latest')
        # Every listed harm category is set to BLOCK_NONE.
        self.safety_settings = {
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        }

    def analyze_paper(self, pdf_path: str) -> str:
        """Upload a PDF, ask the model to analyze it, and return the answer.

        Args:
            pdf_path: Path to the PDF; relative paths are resolved against
                the current working directory.

        Returns:
            The model's response text on success. On failure a
            human-readable message is returned instead of raising:
            ``"File not found: <path>"`` when the file does not exist, or
            ``"Analysis failed: <error>"`` for any other error.
        """
        try:
            abs_path = Path(pdf_path).absolute()
            print(f"Looking for PDF at: {abs_path}")

            if not abs_path.exists():
                # List the sibling files to help diagnose a wrong path.
                available_files = list(abs_path.parent.glob('*'))
                print(f"Available files: {available_files}")
                return f"File not found: {abs_path}"

            # Upload exactly once; a second upload would leave an orphaned
            # remote file that is never deleted.
            uploaded_file = genai.upload_file(str(abs_path))
            try:
                response = self.model.generate_content(
                    [self._PROMPT, uploaded_file],
                    safety_settings=self.safety_settings,
                    generation_config={"temperature": 0.2},
                )
                return response.text
            finally:
                # Remove the remote copy even when generation fails.
                genai.delete_file(uploaded_file.name)

        except Exception as e:
            return f"Analysis failed: {str(e)}"
|
| 79 |
+
|
| 80 |
+
if __name__ == "__main__":
    # Manual smoke run: analyze one local PDF and save the Markdown report.
    analyzer = PaperAnalyzer()

    paper_path = r"papers/test_pdf.pdf"

    # Print path diagnostics first; a wrong working directory is the usual
    # reason the PDF is not found.
    print(f"Current working directory: {os.getcwd()}")
    print(f"Path exists: {Path(paper_path).exists()}")

    analysis = analyzer.analyze_paper(paper_path)
    print(analysis)

    # Explicit UTF-8: the report can contain non-ASCII symbols that the
    # platform default encoding may be unable to write.
    with open("full_analysis.md", "w", encoding="utf-8") as f:
        f.write(analysis)
|