smokxy committed on
Commit
1d07116
·
1 Parent(s): 18b9802

Initial design and tests

Browse files
.env ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face
2
+ HF_API_KEY=your_huggingface_key
3
+
4
+ # Gemini
5
+ GEMINI_API_KEY=your_gemini_key
6
+
7
+ # Redis
8
+ REDIS_HOST=localhost
9
+ REDIS_PORT=6379
10
+ REDIS_DB=0
11
+
12
+ # App Config
13
+ DAILY_PAPER_LIMIT=
14
+ CACHE_TTL=86400
InitialDesign.png ADDED
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```
2
+ paperflux/
3
+ ├── .env.example
4
+ ├── pyproject.toml
5
+ ├── poetry.lock
6
+ ├── README.md
7
+ ├── .gitignore
8
+ ├── src/
9
+ │ ├── __init__.py
10
+ │ ├── tools/
11
+ │ │ ├── __init__.py
12
+ │ │ ├── hf_tools/
13
+ │ │ │ ├── __init__.py
14
+ │ │ │ ├── paper_pdf_tool.py
15
+ │ │ │ └── summarization_tool.py
16
+ │ │ ├── cache/
17
+ │ │ │ ├── __init__.py
18
+ │ │ │ ├── redis_client.py # Core Redis operations
19
+ │ │ │ └── cache_interface.py # Abstract base class
20
+ │ │ └── cache_manager.py # High-level cache operations
21
+ │ ├── agents/
22
+ │ │ ├── __init__.py
23
+ │ │ └── agent.py
24
+ │ ├── models/
25
+ │ │ ├── __init__.py
26
+ │ │ └── model.py # Pydantic models for data validation
27
+ │ ├── scheduler.py # Scheduled cache updates
28
+ │ └── app.py # Gradio web app
29
+ ```
30
+
31
+ The tree above is the agentic workflow design. The initial workflow will use the Gemini API key and will later be extended into an agentic system.
paperflux/__init__.py ADDED
File without changes
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "paperflux"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = [
6
+ {name = "kartikbhtt7",email = "kartikbhtt7@gmail.com"}
7
+ ]
8
+ license = {text = "MIT"}
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "gradio (>=5.16.0,<6.0.0)",
13
+ "google-generativeai (>=0.8.4,<0.9.0)",
14
+ "redis (>=5.2.1,<6.0.0)",
15
+ "python-dotenv (>=1.0.1,<2.0.0)",
16
+ "requests (>=2.32.3,<3.0.0)",
17
+ "pypdf2 (>=3.0.1,<4.0.0)",
18
+ "apscheduler (>=3.11.0,<4.0.0)",
19
+ "python-multipart (>=0.0.20,<0.0.21)",
20
+ "httpx (>=0.28.1,<0.29.0)",
21
+ "markdown (>=3.7,<4.0)"
22
+ ]
23
+
24
+
25
+ [build-system]
26
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
27
+ build-backend = "poetry.core.masonry.api"
tests/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ```
2
+ These are simple tests for individual components
3
+ ```
tests/api_test.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import aiohttp
2
+ import asyncio
3
+ import os
4
+ from datetime import datetime
5
+
6
# Hugging Face API endpoint queried for the daily papers listing (JSON).
API_URL = "https://huggingface.co/api/daily_papers"
# arXiv PDF URL template; ``{id}`` is replaced with the paper identifier.
PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
# Directory, relative to the working directory, where PDFs are written.
DOWNLOAD_DIR = "papers"


async def fetch_papers(session):
    """Fetch the paper listing from ``API_URL``.

    Args:
        session: An aiohttp-style client session; ``session.get(url)`` must
            return an async context manager yielding a response that exposes
            ``status`` and an awaitable ``json()``.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        RuntimeError: If the API answers with any status other than 200.
            ``RuntimeError`` is a subclass of ``Exception``, so existing
            ``except Exception`` handlers keep working.
    """
    async with session.get(API_URL) as response:
        if response.status != 200:
            raise RuntimeError(f"API request failed: {response.status}")
        return await response.json()
15
+
16
async def download_pdf(session, paper_entry):
    """Download the PDF for one paper entry into ``DOWNLOAD_DIR``.

    The file is saved as ``<today>_<paper id with '/' replaced by '_'>.pdf``.
    Failures are reported on stdout and signalled through the return value
    rather than raised, so one bad paper does not abort a batch of downloads.

    Args:
        session: An aiohttp-style client session used for the GET request.
        paper_entry: Mapping with the paper id stored at
            ``paper_entry["paper"]["id"]``.

    Returns:
        A ``(paper_id, success)`` tuple. ``paper_id`` is ``None`` when the
        entry does not contain an id; ``success`` is ``True`` only when the
        server answered 200 and the file was written.
    """
    # Resolve the id on its own: the error message of the download handler
    # below needs it, so it must be bound before that handler can run.
    try:
        paper_id = paper_entry["paper"]["id"]
    except (KeyError, TypeError) as e:
        print(f"Error downloading paper: entry has no paper id ({e!r})")
        return (None, False)

    try:
        pdf_url = PDF_BASE_URL.format(id=paper_id)
        # Some ids contain "/", which is not valid inside a file name.
        clean_id = paper_id.replace("/", "_")
        filename = f"{datetime.now().date()}_{clean_id}.pdf"
        filepath = os.path.join(DOWNLOAD_DIR, filename)

        async with session.get(pdf_url) as response:
            if response.status == 200:
                content = await response.read()
                with open(filepath, "wb") as f:
                    f.write(content)
                return (paper_id, True)
            return (paper_id, False)
    except Exception as e:
        # Deliberately broad: this is a best-effort, per-paper download and
        # any failure (network, disk, malformed id) is reported, not raised.
        print(f"Error downloading {paper_id}: {str(e)}")
        return (paper_id, False)
34
+
35
# Create the download directory up front so download_pdf can write into it.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)


def _print_paper(index, paper):
    """Print a short human-readable description of one paper record.

    Missing or ``None`` fields are tolerated: an absent summary prints as an
    empty string and an author without a name prints as ``<unknown>``.
    """
    paper_id = paper.get("id")
    authors = ", ".join(
        author.get("name") or "<unknown>" for author in paper.get("authors") or []
    )
    summary = paper.get("summary") or ""

    print(f"\nPaper {index}:")
    print(f"ID: {paper_id}")
    print(f"Title: {paper.get('title')}")
    print(f"Authors: {authors}")
    print(f"Published: {paper.get('publishedAt')}")
    print(f"Summary: {summary[:200]}...")
    print(f"PDF URL: {PDF_BASE_URL.format(id=paper_id)}")


async def main():
    """Fetch the paper listing, print each entry, then download every PDF.

    All downloads are started together and awaited with ``asyncio.gather``;
    the number of successful downloads is printed at the end.
    """
    async with aiohttp.ClientSession() as session:
        papers = await fetch_papers(session)

        print(f"\nFound {len(papers)} papers:")
        for i, paper_entry in enumerate(papers, 1):
            _print_paper(i, paper_entry.get("paper") or {})

        tasks = [download_pdf(session, paper_entry) for paper_entry in papers]
        results = await asyncio.gather(*tasks)

        successful = sum(1 for _, status in results if status)
        print(f"Downloaded {successful}/{len(papers)} papers successfully")


if __name__ == "__main__":
    asyncio.run(main())
tests/pdf_workflow_test.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import google.generativeai as genai
3
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
4
+ from pathlib import Path
5
+
6
# Read the key from the GEMINI_API_KEY environment variable instead of
# hard-coding it, so no secret has to be edited into the source. The variable
# must already be set in the process environment; nothing here loads a .env
# file. Falls back to an empty string when the variable is unset.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
genai.configure(api_key=GEMINI_API_KEY)
8
+
9
class PaperAnalyzer:
    """Upload a PDF research paper to Gemini and return a Markdown analysis."""

    # Instruction text sent to the model together with the uploaded PDF.
    _PROMPT = (
        "Analyze this research paper thoroughly, considering both text and visual elements:\n"
        "Provide in depth explanation with all core mathematical concepts and intuition behind them.\n"
        "\n"
        "1. Paper Structure Analysis:\n"
        "- Identify key sections (Abstract, Methodology, Results, etc.)\n"
        "- Map the paper's argument flow\n"
        "\n"
        "2. Technical Content:\n"
        "- Explain core innovations with equations/examples\n"
        "- Analyze diagrams/figures and their significance\n"
        "- Extract key algorithms/pseudocode\n"
        "\n"
        "3. Critical Evaluation:\n"
        "- Strengths/weaknesses of methodology\n"
        "- Compare with cited works\n"
        "- Suggest improvements\n"
        "\n"
        "4. Visual Element Analysis:\n"
        "- Describe important figures/diagrams\n"
        "- Explain visual data representations\n"
        "- Connect images to textual content\n"
        "\n"
        "Format output in Markdown with these sections:\n"
        "# Paper Title\n"
        "## Core Contribution\n"
        "## Technical Breakdown\n"
        "## Visual Analysis\n"
        "## Critical Assessment\n"
        "## Potential Applications\n"
    )

    def __init__(self):
        """Create the Gemini model handle and the safety settings used per call."""
        self.model = genai.GenerativeModel('gemini-1.5-pro-latest')
        # Every listed harm category is set to BLOCK_NONE.
        self.safety_settings = {
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        }

    def analyze_paper(self, pdf_path: str) -> str:
        """Analyze a PDF with Gemini and return the model's text response.

        The file is uploaded once, passed to the model together with the
        analysis prompt, and the uploaded copy is deleted afterwards even
        when generation fails.

        Args:
            pdf_path: Path to the PDF; relative paths are resolved against
                the current working directory.

        Returns:
            The response text on success, ``"File not found: <path>"`` when
            the file does not exist, or ``"Analysis failed: <error>"`` when
            any step raises. Errors are returned as text, never raised.
        """
        uploaded_file = None
        try:
            abs_path = Path(pdf_path).absolute()
            print(f"Looking for PDF at: {abs_path}")

            if not abs_path.exists():
                # List the sibling files to make a wrong path easy to spot.
                available_files = list(abs_path.parent.glob('*'))
                print(f"Available files: {available_files}")
                return f"File not found: {abs_path}"

            # Upload exactly once; a second upload would leave a remote file
            # behind that is never deleted.
            uploaded_file = genai.upload_file(str(abs_path))

            response = self.model.generate_content(
                [self._PROMPT, uploaded_file],
                safety_settings=self.safety_settings,
                generation_config={"temperature": 0.2},
            )
            return response.text

        except Exception as e:
            return f"Analysis failed: {str(e)}"

        finally:
            # Remove the remote copy whether or not generation succeeded. A
            # cleanup failure is reported but must not replace the result.
            if uploaded_file is not None:
                try:
                    genai.delete_file(uploaded_file.name)
                except Exception as cleanup_error:
                    print(f"Could not delete uploaded file: {cleanup_error}")
79
+
80
if __name__ == "__main__":
    # Manual smoke run: analyze one local PDF, print the result and save it.
    analyzer = PaperAnalyzer()

    paper_path = r"papers/test_pdf.pdf"

    # Show where the relative path is being resolved from.
    print(f"Current working directory: {os.getcwd()}")
    print(f"Path exists: {Path(paper_path).exists()}")

    analysis = analyzer.analyze_paper(paper_path)
    print(analysis)

    # Explicit UTF-8: the analysis may contain non-ASCII characters, which
    # can fail to encode under a platform's default text encoding.
    with open("full_analysis.md", "w", encoding="utf-8") as f:
        f.write(analysis)