# Extracted from a Spaces file view ("Running on Zero"); file size: 15,792 bytes; revision 3b58a0b.
import sys
import os
# Set OMP_NUM_THREADS to "4" in this process's environment. This runs before the
# remaining imports below -- presumably so that OpenMP-backed libraries loaded
# later pick the value up (TODO confirm).
os.environ["OMP_NUM_THREADS"] = "4"
# Device name for docling, read from the DOCLING_DEVICE environment variable
# (default "cpu") and lower-cased. pdf_to_md() selects the CUDA accelerator for
# "cuda" and the CPU accelerator for any other value.
DOCLING_DEVICE = os.environ.get("DOCLING_DEVICE", "cpu").lower()
import re
import json
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.request
import urllib.parse
import threading
from arxiv import _fetch_metadata_by_id
# Best effort: switch stdout to UTF-8 when the stream object offers
# reconfigure(). Any failure is deliberately ignored so that this
# module-level setup can never abort the import.
try:
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    pass
# Opener used for arXiv hosts, which are meant to be fetched without a proxy.
# The explicit empty ProxyHandler matters: build_opener() with no arguments
# installs a default ProxyHandler that picks up proxy settings from the
# environment, which would make this opener behave exactly like urlopen().
ARXIV_DIRECT_OPENER = urllib.request.build_opener(urllib.request.ProxyHandler({}))
# Hostnames fetched through ARXIV_DIRECT_OPENER; download_pdf_and_convert_md()
# also matches any subdomain of these names.
ARXIV_HOSTS = {"arxiv.org", "www.arxiv.org", "export.arxiv.org"}
# Held by pdf_to_md() for the duration of a docling conversion so that only one
# PDF is converted at a time.
PDF_CONVERT_LOCK = threading.Lock()
from pathlib import Path
import shutil
# Cached docling DocumentConverter; stays None until pdf_to_md() builds it on
# its first call and is reused by every later call.
_docling_converter = None
# Not referenced anywhere in the visible part of this file -- presumably meant
# to record the device the cached converter was built for (TODO confirm).
_docling_current_device = None
def _read_text(file_path: str) -> str:
    """Return the full contents of ``file_path`` decoded as UTF-8 text."""
    return Path(file_path).read_text(encoding="utf-8")
# One left-to-right pass: the first alternative matches a well-formed JSON
# escape (\\ \" \/ \b \f \n \r \t or \uXXXX), the second a lone backslash.
_JSON_ESCAPE_OR_LONE_BACKSLASH = re.compile(r'\\(?:["\\/bfnrt]|u[0-9a-fA-F]{4})|\\')


def _fix_json_escapes(json_str: str) -> str:
    """Fix unescaped backslashes in JSON text (e.g. backslashes in LaTeX formulas).

    Well-formed JSON escape sequences, including ``\\uXXXX`` unicode escapes,
    are kept verbatim; every other backslash is doubled so the text can be
    parsed by ``json.loads``.

    Known limitation: a LaTeX command whose first letter is also a JSON escape
    letter (``\\nu``, ``\\theta``, ``\\beta``, ``\\frac``, ``\\rho`` ...) is
    indistinguishable from a JSON escape and is therefore left untouched.

    Args:
        json_str: JSON text that may contain stray backslashes.

    Returns:
        The text with stray backslashes doubled.
    """
    def _keep_or_double(match):
        token = match.group(0)
        # A match longer than one character is a valid escape sequence.
        return token if len(token) > 1 else '\\\\'

    return _JSON_ESCAPE_OR_LONE_BACKSLASH.sub(_keep_or_double, json_str)
def pdf_to_md(pdf_path: str, output_path: str) -> str | None:
    """Convert a PDF to Markdown with docling and save it under ``output_path``.

    The docling converter is built on first use and cached in the module-level
    ``_docling_converter``. Building it and running the conversion both happen
    while holding ``PDF_CONVERT_LOCK``, so only one PDF is converted at a time.
    docling is imported lazily to avoid triggering CUDA initialization errors
    in the HF Spaces Stateless GPU environment.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: Directory that receives ``<pdf stem>.md``; created when
            it does not exist yet.

    Returns:
        The path of the written Markdown file, or ``None`` when any step
        fails (the error and its traceback are printed, never raised).
    """
    global _docling_converter
    try:
        pdf_file = Path(pdf_path)
        # exist_ok avoids the check-then-create race when several worker
        # threads prepare the same output directory at once.
        os.makedirs(output_path, exist_ok=True)
        print(f"[DEBUG] Preparing to convert PDF: {pdf_path}")
        print("[DEBUG] Waiting to acquire docling conversion lock...")
        with PDF_CONVERT_LOCK:
            print("[DEBUG] Lock acquired, starting docling conversion...")
            if _docling_converter is None:
                device_str = DOCLING_DEVICE
                print(f"[DEBUG] First use, importing and initializing docling DocumentConverter ({device_str.upper()} mode)...")
                from docling.document_converter import DocumentConverter, PdfFormatOption
                from docling.datamodel.pipeline_options import PdfPipelineOptions
                from docling.datamodel.base_models import InputFormat
                try:
                    from docling.datamodel.pipeline_options import AcceleratorDevice
                    if device_str == "cuda":
                        accelerator_device = AcceleratorDevice.CUDA
                    else:
                        accelerator_device = AcceleratorDevice.CPU
                except ImportError:
                    # AcceleratorDevice is not importable from this docling
                    # version: hand the raw device string over instead.
                    accelerator_device = device_str
                pipeline_options = PdfPipelineOptions()
                pipeline_options.accelerator_options.device = accelerator_device
                _docling_converter = DocumentConverter(
                    format_options={
                        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
                    }
                )
                print(f"[DEBUG] docling DocumentConverter initialization complete ({device_str.upper()} mode)")
            converter = _docling_converter
            print("[DEBUG] Calling docling converter.convert()...")
            raw_result = converter.convert(pdf_path)
            # The result either wraps the document in `.document` or can be
            # exported directly; both shapes are handled.
            if hasattr(raw_result, 'document'):
                md_content = raw_result.document.export_to_markdown()
            else:
                md_content = raw_result.export_to_markdown()
            print("[DEBUG] docling conversion complete, releasing lock")
        target_md = os.path.join(output_path, pdf_file.stem + ".md")
        with open(target_md, 'w', encoding='utf-8') as f:
            f.write(md_content)
        print(f"[SUCCESS] Markdown file saved to: {target_md}")
        print(f"[DEBUG] Markdown file size: {len(md_content)} characters")
        return target_md
    except Exception as e:
        # Callers treat None as "conversion failed" and fall back to a
        # metadata-only file, so nothing is re-raised here.
        print(f"[ERROR] pdf_to_md failed: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return None
def _safe_filename(name: str) -> str:
    """Replace each run of ``<>:"/\\|?*`` with one underscore and keep at most 100 characters."""
    sanitized = re.sub(r'[<>:"/\\|?*]+', '_', name)
    return sanitized[:100]
def _paper_base_name(paper: dict) -> str:
    """Build the sanitized file stem shared by a paper's PDF and Markdown files."""
    title = paper.get('title') or paper.get('arxiv_id') or 'paper'
    arxiv_id = paper.get('arxiv_id') or ''
    base_name = f"{arxiv_id}_{title[:50]}" if arxiv_id else title[:50]
    return _safe_filename(base_name)


def _resolve_pdf_url(paper: dict) -> str:
    """Return the paper's PDF URL, derived from ``abs_url`` if needed; '' when neither is set."""
    pdf_url = paper.get('pdf_url') or ''
    if pdf_url:
        return pdf_url
    abs_url = paper.get('abs_url') or ''
    if not abs_url:
        return ''
    pdf_url = abs_url.replace('/abs/', '/pdf/')
    if not pdf_url.endswith('.pdf'):
        pdf_url = pdf_url + '.pdf'
    return pdf_url


def _download_pdf(pdf_url: str, pdf_path: str) -> None:
    """Download ``pdf_url`` into ``pdf_path``, trying up to three times; re-raise the last error."""
    # Imported explicitly instead of relying on urllib.request pulling it in.
    import urllib.error

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        # urllib never decompresses response bodies, so a gzip/deflate reply
        # would fail the %PDF- check below; ask for the raw bytes instead.
        'Accept-Encoding': 'identity',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    parsed = urllib.parse.urlparse(pdf_url)
    host = (parsed.hostname or '').lower()
    use_direct = (host in ARXIV_HOSTS) or any(host.endswith('.' + h) for h in ARXIV_HOSTS)
    print(f"[DEBUG] Using proxy mode: {'DIRECT_OPENER' if use_direct else 'urlopen (with proxy)'}")
    opener_open = ARXIV_DIRECT_OPENER.open if use_direct else urllib.request.urlopen
    max_retries = 3
    retry_delay = 2
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            if attempt > 0:
                # Growing delay plus jitter before every retry.
                wait_time = retry_delay * (attempt + 1) + random.uniform(0.5, 2.0)
                print(f"[INFO] Waiting {wait_time:.1f} seconds before retry (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait_time)
            print(f"[DEBUG] Sending HTTP request (attempt {attempt + 1}/{max_retries})...")
            req = urllib.request.Request(pdf_url, headers=headers)
            with opener_open(req, timeout=60) as resp:
                pdf_data = resp.read()
            print(f"[DEBUG] Download complete, size: {len(pdf_data)} bytes")
            if len(pdf_data) < 100:
                print(f"[WARNING] Downloaded file is too small ({len(pdf_data)} bytes), may not be a valid PDF")
                if not is_last_attempt:
                    continue
                raise ValueError(f"Downloaded file is too small: {len(pdf_data)} bytes")
            if not pdf_data.startswith(b'%PDF-'):
                print(f"[WARNING] Downloaded file is not a valid PDF (header: {pdf_data[:20]})")
                if not is_last_attempt:
                    continue
                raise ValueError("Downloaded file is not a valid PDF format")
            with open(pdf_path, 'wb') as f:
                f.write(pdf_data)
            print(f"[SUCCESS] PDF download successful: {pdf_path}")
            return
        except urllib.error.HTTPError as e:
            error_msg = f"HTTP Error {e.code}: {e.reason}"
            print(f"[WARNING] {error_msg}")
            if e.code == 403:
                print("[TIP] 403 error usually indicates:")
                print(" 1. Request identified as bot (using real browser User-Agent)")
                print(" 2. IP temporarily rate-limited (retrying...)")
                print(" 3. Longer request interval needed")
            if is_last_attempt:
                raise
        except Exception as e:
            print(f"[WARNING] Download exception: {type(e).__name__}: {e}")
            if is_last_attempt:
                raise
        # Reached only after a failed attempt that will be retried.
        time.sleep(random.uniform(1.0, 3.0))


def download_pdf_and_convert_md(paper: dict, output_dir: str) -> str | None:
    """Download a paper's PDF and convert it to Markdown.

    When the PDF URL is unknown, the download fails after all retries, or the
    conversion fails, a fallback Markdown file holding only the paper's
    metadata is written instead. An already downloaded PDF is reused.

    Args:
        paper: Dictionary containing paper metadata (title, arxiv_id,
            pdf_url, abs_url, abstract, authors).
        output_dir: Directory to save downloaded PDFs and converted Markdown
            files; created when missing.

    Returns:
        Path of the converted (or fallback) Markdown file, or ``None`` when
        not even the fallback file could be written.
    """
    papers_dir = output_dir
    # exist_ok avoids the check-then-create race between worker threads.
    os.makedirs(papers_dir, exist_ok=True)

    def create_fallback_markdown_file(paper: dict, safe_name: str, error_msg: str = "") -> str | None:
        """Create a Markdown file containing basic paper information"""
        # `or` (not a .get default) so keys that are present but None/empty
        # still receive the placeholder text.
        title = paper.get('title') or 'Unknown Paper'
        abstract = paper.get('abstract') or 'No abstract available'
        arxiv_id = paper.get('arxiv_id') or 'N/A'
        pdf_url = paper.get('pdf_url') or ''
        abs_url = paper.get('abs_url') or ''
        authors = paper.get('authors') or []
        if isinstance(authors, str):
            authors_str = authors
        else:
            # str() keeps this last-resort writer from failing on non-string entries.
            authors_str = ', '.join(str(author) for author in authors) or 'Unknown'
        error_line = f"**Error**: {error_msg}" if error_msg else ""
        md_content = f"""# {title}
**arXiv ID**: {arxiv_id}
**Authors**: {authors_str}
**PDF URL**: {pdf_url}
**Abstract URL**: {abs_url}
---
**Note**: PDF download or conversion failed. Only metadata is available.
{error_line}
---
## Abstract
{abstract}
---
**Full text is not available. Please refer to the original paper via the URLs above.**
"""
        md_path = os.path.join(papers_dir, f"{safe_name}.md")
        try:
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(md_content)
            print(f"[INFO] Created fallback Markdown file (metadata only): {md_path}")
            return md_path
        except Exception as e:
            print(f"[ERROR] Failed to create fallback Markdown: {e}")
            return None

    try:
        # A title that is present but None must not abort the download.
        print(f"[DEBUG] Starting paper download: {(paper.get('title') or 'Unknown')[:60]}...")
        print(f"[DEBUG] arXiv ID: {paper.get('arxiv_id', 'N/A')}")
        safe = _paper_base_name(paper)
        pdf_url = _resolve_pdf_url(paper)
        if not pdf_url:
            print("[WARNING] Unable to get PDF URL, creating fallback Markdown")
            return create_fallback_markdown_file(paper, safe, "Unable to get PDF URL")
        print(f"[DEBUG] PDF URL: {pdf_url}")
        pdf_path = os.path.join(papers_dir, f"{safe}.pdf")
        try:
            if not os.path.exists(pdf_path):
                print(f"[DEBUG] Starting PDF download to: {pdf_path}")
                _download_pdf(pdf_url, pdf_path)
            else:
                print(f"[DEBUG] PDF already exists, skipping download: {pdf_path}")
        except Exception as e:
            print(f"[ERROR] PDF download failed (all retries exhausted): {type(e).__name__}: {e}")
            print("[INFO] Creating fallback Markdown (metadata only)")
            return create_fallback_markdown_file(paper, safe, f"PDF download failed: {str(e)}")
        print("[DEBUG] Starting PDF to Markdown conversion...")
        md_path = pdf_to_md(pdf_path, papers_dir)
        if md_path and os.path.isfile(md_path):
            print(f"[SUCCESS] Markdown conversion successful: {md_path}")
            return md_path
        print("[WARNING] Markdown conversion failed, creating fallback Markdown")
        return create_fallback_markdown_file(paper, safe, "PDF conversion failed")
    except Exception as e:
        print(f"[WARNING] download_pdf_and_convert_md exception: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        try:
            return create_fallback_markdown_file(paper, _paper_base_name(paper), f"Processing exception: {str(e)}")
        except Exception as fallback_error:
            # Nothing usable could be written; report it and signal total failure.
            print(f"[ERROR] Failed to create fallback Markdown: {fallback_error}")
            return None
# Absolute path of the directory that contains this module.
_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
# Directory that load_prompt() reads prompt files from: the "prompts" folder
# located next to this module.
PROMPTS_DIR = os.path.join(_CURRENT_DIR, "prompts")
import yaml
# Legacy numbered prompt file names (*.txt) and the descriptive YAML file each
# one was renamed to. load_prompt() looks a requested name up here first.
PROMPT_NAME_MAPPING = {
    "1.txt": "semantic_encoder.yaml",
    "2.txt": "issue_extractor.yaml",
    "2_c.txt": "issue_extractor_checker.yaml",
    "3.txt": "literature_retrieval.yaml",
    "4.txt": "reference_filter.yaml",
    "5.txt": "reference_analyzer.yaml",
    "6.txt": "strategy_generator.yaml",
    "7.txt": "strategy_reviewer.yaml",
    "7_h.txt": "strategy_human_refinement.yaml",
    "8.txt": "rebuttal_writer.yaml",
    "9.txt": "rebuttal_reviewer.yaml",
}
def load_prompt(name: str) -> str:
    """Load a prompt from a YAML or TXT file in ``PROMPTS_DIR``.

    Supports both the new YAML format and the legacy TXT format. ``name`` is
    first translated through ``PROMPT_NAME_MAPPING``; if the mapped file does
    not exist, the file called ``name`` itself is tried.

    For YAML files the string stored under the top-level ``prompt`` key is
    returned. When the file is not valid YAML, has no ``prompt`` key, or the
    value under that key is not a string, the raw file text is returned so
    the declared ``str`` return type always holds.

    Args:
        name: Prompt file name, either a legacy name such as ``"1.txt"`` or
            the actual file name.

    Returns:
        The prompt text.

    Raises:
        FileNotFoundError: If neither the mapped nor the original file exists.
    """
    # Map old names to new names
    mapped_name = PROMPT_NAME_MAPPING.get(name, name)
    prompt_path = os.path.join(PROMPTS_DIR, mapped_name)
    # Try the mapped (YAML) file first, then fall back to the original name
    if not os.path.exists(prompt_path):
        prompt_path = os.path.join(PROMPTS_DIR, name)
        if not os.path.exists(prompt_path):
            raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
    with open(prompt_path, "r", encoding="utf-8") as f:
        content = f.read()
    if not prompt_path.endswith(('.yaml', '.yml')):
        return content
    # It's a YAML file: extract the prompt field
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError:
        return content
    if isinstance(data, dict):
        prompt = data.get('prompt')
        # An empty `prompt:` key parses to None; never hand a non-string back.
        if isinstance(prompt, str):
            return prompt
    return content