Spaces:
Running
on
Zero
Running
on
Zero
| import sys | |
| import os | |
# Set the OpenMP thread-count variable for this process.
# NOTE(review): presumably placed this early so it is already set when
# OpenMP-backed libraries get imported later — confirm.
os.environ["OMP_NUM_THREADS"] = "4"
# Device name used when building the docling converter in pdf_to_md; read from
# the DOCLING_DEVICE environment variable, lower-cased, defaulting to "cpu".
DOCLING_DEVICE = os.environ.get("DOCLING_DEVICE", "cpu").lower()
| import re | |
| import json | |
| import time | |
| import random | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| import urllib.request | |
| import urllib.parse | |
| import threading | |
| from arxiv import _fetch_metadata_by_id | |
# Best effort: switch stdout to UTF-8 when the stream offers reconfigure().
# Any failure is deliberately ignored so that importing this module never
# breaks because of console encoding setup.
try:
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    pass
# Opener used for PDF downloads whose host is in (or a subdomain of) ARXIV_HOSTS.
# NOTE(review): build_opener() called without arguments installs urllib's
# default handler chain, which includes ProxyHandler, so this opener still
# honors proxy environment variables. If a truly direct connection is
# intended, it would need urllib.request.ProxyHandler({}) — confirm intent.
ARXIV_DIRECT_OPENER = urllib.request.build_opener()
# Hostnames treated as arXiv when choosing which opener to download with.
ARXIV_HOSTS = {"arxiv.org", "www.arxiv.org", "export.arxiv.org"}
# Held by pdf_to_md around the docling call so only one PDF converts at a time.
PDF_CONVERT_LOCK = threading.Lock()
| from pathlib import Path | |
| import shutil | |
# Cached docling DocumentConverter; created lazily by pdf_to_md on first use.
_docling_converter = None
# NOTE(review): not referenced anywhere in the visible code — presumably meant
# to record the device the cached converter was built for; confirm or remove.
_docling_current_device = None
def _read_text(file_path: str) -> str:
    """Read ``file_path`` as UTF-8 text and return its entire content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        content = handle.read()
    return content
def _fix_json_escapes(json_str: str) -> str:
    """Escape backslashes that do not start a valid JSON escape sequence.

    Intended for JSON text that embeds raw backslashes (e.g. LaTeX such as
    ``\\alpha``), which a strict JSON parser would reject.

    A backslash is left untouched when it begins one of the escapes JSON
    defines: ``\\\\``, ``\\"``, ``\\/``, ``\\b``, ``\\f``, ``\\n``, ``\\r``,
    ``\\t`` or ``\\uXXXX`` (exactly four hex digits). Every other backslash
    is doubled so that it decodes as a literal backslash.

    Note that LaTeX commands starting with one of those letters (``\\nabla``,
    ``\\theta``, ``\\frac`` ...) are indistinguishable from real escapes and
    are therefore left as they are.

    Args:
        json_str: JSON text that may contain stray backslashes.

    Returns:
        The text with every stray backslash doubled.
    """
    # Scan left to right. The first alternative consumes a complete valid
    # escape (so "\\\\" pairs are taken together and never re-examined); the
    # second alternative matches a lone backslash that must be doubled.
    # Handling "\\uXXXX" here keeps Unicode escapes intact instead of turning
    # them into literal text.
    escape_or_stray = r'\\(?:["\\/bfnrt]|u[0-9a-fA-F]{4})|\\'

    def _keep_or_double(match):
        token = match.group(0)
        return token if len(token) > 1 else '\\\\'

    return re.sub(escape_or_stray, _keep_or_double, json_str)
def pdf_to_md(pdf_path: str, output_path: str) -> str | None:
    """Convert a PDF to Markdown with docling and save it under ``output_path``.

    The docling call runs while holding the global ``PDF_CONVERT_LOCK``, so
    only one PDF is converted at a time. The converter is built on first use
    (using ``DOCLING_DEVICE``) and cached in ``_docling_converter``.

    Note: docling is imported lazily to avoid triggering CUDA initialization
    errors in HF Spaces Stateless GPU environment.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: Directory that receives ``<pdf stem>.md``; created when
            missing.

    Returns:
        The path of the written Markdown file, or None when anything fails
        (the error and traceback are printed, never raised).
    """
    global _docling_converter
    try:
        # exist_ok avoids the check-then-create race when several threads
        # convert into the same directory at the same time.
        os.makedirs(output_path, exist_ok=True)
        print(f"[DEBUG] Preparing to convert PDF: {pdf_path}")
        print("[DEBUG] Waiting to acquire docling conversion lock...")
        with PDF_CONVERT_LOCK:
            print("[DEBUG] Lock acquired, starting docling conversion...")
            if _docling_converter is None:
                device_str = DOCLING_DEVICE
                print(f"[DEBUG] First use, importing and initializing docling DocumentConverter ({device_str.upper()} mode)...")
                from docling.document_converter import DocumentConverter, PdfFormatOption
                from docling.datamodel.pipeline_options import PdfPipelineOptions
                from docling.datamodel.base_models import InputFormat
                try:
                    from docling.datamodel.pipeline_options import AcceleratorDevice
                    if device_str == "cuda":
                        accelerator_device = AcceleratorDevice.CUDA
                    else:
                        accelerator_device = AcceleratorDevice.CPU
                except ImportError:
                    # AcceleratorDevice could not be imported from this
                    # docling install: pass the plain device string instead.
                    accelerator_device = device_str
                pipeline_options = PdfPipelineOptions()
                pipeline_options.accelerator_options.device = accelerator_device
                _docling_converter = DocumentConverter(
                    format_options={
                        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
                    }
                )
                print(f"[DEBUG] docling DocumentConverter initialization complete ({device_str.upper()} mode)")
            converter = _docling_converter
            print("[DEBUG] Calling docling converter.convert()...")
            raw_result = converter.convert(pdf_path)
            # The result either wraps the document in `.document` or can be
            # exported directly; support both shapes.
            if hasattr(raw_result, 'document'):
                md_content = raw_result.document.export_to_markdown()
            else:
                md_content = raw_result.export_to_markdown()
            print("[DEBUG] docling conversion complete, releasing lock")
        target_md = os.path.join(output_path, Path(pdf_path).stem + ".md")
        with open(target_md, 'w', encoding='utf-8') as f:
            f.write(md_content)
        print(f"[SUCCESS] Markdown file saved to: {target_md}")
        print(f"[DEBUG] Markdown file size: {len(md_content)} characters")
        return target_md
    except Exception as e:
        # Top-level boundary: callers rely on None (not an exception) to
        # fall back to metadata-only output.
        print(f"[ERROR] pdf_to_md failed: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return None
def _safe_filename(name: str) -> str:
    """Make ``name`` usable as a file name.

    Each run of the characters ``< > : " / \\ | ? *`` is collapsed into a
    single underscore, and the result is cut to at most 100 characters.
    """
    sanitized = re.sub(r'[<>:"/\\|?*]+', '_', name)
    return sanitized[:100]
def _paper_file_stem(paper: dict) -> str:
    """Build the sanitized base file name (no extension) for a paper's files."""
    title = paper.get('title') or paper.get('arxiv_id') or 'paper'
    arxiv_id = paper.get('arxiv_id') or ''
    base_name = f"{arxiv_id}_{title[:50]}" if arxiv_id else title[:50]
    return _safe_filename(base_name)


def _resolve_pdf_url(paper: dict) -> str:
    """Return the paper's PDF URL, derived from ``abs_url`` if needed, else ''."""
    pdf_url = paper.get('pdf_url') or ''
    if pdf_url:
        return pdf_url
    abs_url = paper.get('abs_url') or ''
    if not abs_url:
        return ''
    pdf_url = abs_url.replace('/abs/', '/pdf/')
    if not pdf_url.endswith('.pdf'):
        pdf_url = pdf_url + '.pdf'
    return pdf_url


def _create_fallback_markdown_file(paper: dict, papers_dir: str, safe_name: str, error_msg: str = "") -> str | None:
    """Write a metadata-only Markdown file for ``paper``; return its path or None."""
    # `or` (not a .get default) so that keys present with a None/empty value
    # also fall back to the placeholder text.
    title = paper.get('title') or 'Unknown Paper'
    abstract = paper.get('abstract') or 'No abstract available'
    arxiv_id = paper.get('arxiv_id') or 'N/A'
    pdf_url = paper.get('pdf_url') or ''
    abs_url = paper.get('abs_url') or ''
    authors = paper.get('authors') or []
    if isinstance(authors, str):
        # A plain string would otherwise be joined character by character.
        authors_str = authors
    else:
        authors_str = ', '.join(str(author) for author in authors) if authors else 'Unknown'
    md_lines = [
        f"# {title}",
        f"**arXiv ID**: {arxiv_id}",
        f"**Authors**: {authors_str}",
        f"**PDF URL**: {pdf_url}",
        f"**Abstract URL**: {abs_url}",
        "---",
        "**Note**: PDF download or conversion failed. Only metadata is available.",
        f"**Error**: {error_msg}" if error_msg else "",
        "---",
        "## Abstract",
        f"{abstract}",
        "---",
        "**Full text is not available. Please refer to the original paper via the URLs above.**",
    ]
    md_content = "\n".join(md_lines) + "\n"
    md_path = os.path.join(papers_dir, f"{safe_name}.md")
    try:
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        print(f"[INFO] Created fallback Markdown file (metadata only): {md_path}")
        return md_path
    except Exception as e:
        print(f"[ERROR] Failed to create fallback Markdown: {e}")
        return None


def _download_pdf(pdf_url: str, pdf_path: str, max_retries: int = 3, retry_delay: float = 2) -> None:
    """Download ``pdf_url`` to ``pdf_path`` with retries.

    Raises the last error when every attempt fails; on success the file at
    ``pdf_path`` holds a payload that starts with the ``%PDF-`` signature.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        # urllib does not decompress response bodies. Advertising gzip/deflate
        # could yield a compressed payload that fails the %PDF- check below,
        # so explicitly ask for an unencoded body.
        'Accept-Encoding': 'identity',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
    parsed = urllib.parse.urlparse(pdf_url)
    host = (parsed.hostname or '').lower()
    use_direct = (host in ARXIV_HOSTS) or any(host.endswith('.' + h) for h in ARXIV_HOSTS)
    print(f"[DEBUG] Using proxy mode: {'DIRECT_OPENER' if use_direct else 'urlopen (with proxy)'}")
    opener_open = ARXIV_DIRECT_OPENER.open if use_direct else urllib.request.urlopen
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            if attempt > 0:
                # Linear back-off plus jitter before every retry.
                wait_time = retry_delay * (attempt + 1) + random.uniform(0.5, 2.0)
                print(f"[INFO] Waiting {wait_time:.1f} seconds before retry (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait_time)
            print(f"[DEBUG] Sending HTTP request (attempt {attempt + 1}/{max_retries})...")
            req = urllib.request.Request(pdf_url, headers=headers)
            with opener_open(req, timeout=60) as resp:
                pdf_data = resp.read()
            print(f"[DEBUG] Download complete, size: {len(pdf_data)} bytes")
            if len(pdf_data) < 100:
                print(f"[WARNING] Downloaded file is too small ({len(pdf_data)} bytes), may not be a valid PDF")
                if not is_last_attempt:
                    continue
                raise ValueError(f"Downloaded file is too small: {len(pdf_data)} bytes")
            if not pdf_data.startswith(b'%PDF-'):
                print(f"[WARNING] Downloaded file is not a valid PDF (header: {pdf_data[:20]})")
                if not is_last_attempt:
                    continue
                raise ValueError("Downloaded file is not a valid PDF format")
            # Write to a side file and rename, so an interrupted write never
            # leaves a truncated PDF that later calls would treat as cached.
            tmp_path = pdf_path + '.part'
            with open(tmp_path, 'wb') as f:
                f.write(pdf_data)
            os.replace(tmp_path, pdf_path)
            print(f"[SUCCESS] PDF download successful: {pdf_path}")
            return
        except urllib.error.HTTPError as e:
            error_msg = f"HTTP Error {e.code}: {e.reason}"
            print(f"[WARNING] {error_msg}")
            if e.code == 403:
                print("[TIP] 403 error usually indicates:")
                print(" 1. Request identified as bot (using real browser User-Agent)")
                print(" 2. IP temporarily rate-limited (retrying...)")
                print(" 3. Longer request interval needed")
            if is_last_attempt:
                raise
        except Exception as e:
            print(f"[WARNING] Download exception: {type(e).__name__}: {e}")
            if is_last_attempt:
                raise
            time.sleep(random.uniform(1.0, 3.0))


def download_pdf_and_convert_md(paper: dict, output_dir: str) -> str | None:
    """Download a paper's PDF and convert it to Markdown.

    If the PDF URL cannot be determined, or the download or conversion
    fails, a fallback Markdown file containing only the paper metadata is
    written instead.

    Args:
        paper: Dictionary containing paper metadata (title, arxiv_id,
            pdf_url, abs_url, abstract, authors).
        output_dir: Directory to save downloaded PDFs and converted
            Markdown files; created when missing.

    Returns:
        Path of the converted Markdown file, or of the fallback Markdown
        file, or None when even the fallback file could not be written.
    """
    papers_dir = output_dir
    if not os.path.exists(papers_dir):
        # exist_ok closes the check-then-create race between worker threads.
        os.makedirs(papers_dir, exist_ok=True)
    try:
        # `or` keeps a title that is present but None from crashing the slice.
        print(f"[DEBUG] Starting paper download: {(paper.get('title') or 'Unknown')[:60]}...")
        print(f"[DEBUG] arXiv ID: {paper.get('arxiv_id', 'N/A')}")
        safe = _paper_file_stem(paper)
        pdf_url = _resolve_pdf_url(paper)
        if not pdf_url:
            print("[WARNING] Unable to get PDF URL, creating fallback Markdown")
            return _create_fallback_markdown_file(paper, papers_dir, safe, "Unable to get PDF URL")
        print(f"[DEBUG] PDF URL: {pdf_url}")
        pdf_path = os.path.join(papers_dir, f"{safe}.pdf")
        try:
            if os.path.exists(pdf_path):
                print(f"[DEBUG] PDF already exists, skipping download: {pdf_path}")
            else:
                print(f"[DEBUG] Starting PDF download to: {pdf_path}")
                _download_pdf(pdf_url, pdf_path)
        except Exception as e:
            print(f"[ERROR] PDF download failed (all retries exhausted): {type(e).__name__}: {e}")
            print("[INFO] Creating fallback Markdown (metadata only)")
            return _create_fallback_markdown_file(paper, papers_dir, safe, f"PDF download failed: {str(e)}")
        print("[DEBUG] Starting PDF to Markdown conversion...")
        md_path = pdf_to_md(pdf_path, papers_dir)
        if md_path and os.path.isfile(md_path):
            print(f"[SUCCESS] Markdown conversion successful: {md_path}")
            return md_path
        print("[WARNING] Markdown conversion failed, creating fallback Markdown")
        return _create_fallback_markdown_file(paper, papers_dir, safe, "PDF conversion failed")
    except Exception as e:
        # Last-resort boundary: report the problem and still try to leave a
        # metadata-only file behind.
        print(f"[WARNING] download_pdf_and_convert_md exception: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        try:
            return _create_fallback_markdown_file(
                paper, papers_dir, _paper_file_stem(paper), f"Processing exception: {str(e)}"
            )
        except Exception:
            return None
# Directory containing this module; the prompts folder is resolved against it.
_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
# Folder holding the prompt files that load_prompt reads.
PROMPTS_DIR = os.path.join(_CURRENT_DIR, "prompts")
import yaml
# Mapping from old prompt names to new YAML names
# (load_prompt consults it first and falls back to the name it was given).
PROMPT_NAME_MAPPING = {
    "1.txt": "semantic_encoder.yaml",
    "2.txt": "issue_extractor.yaml",
    "2_c.txt": "issue_extractor_checker.yaml",
    "3.txt": "literature_retrieval.yaml",
    "4.txt": "reference_filter.yaml",
    "5.txt": "reference_analyzer.yaml",
    "6.txt": "strategy_generator.yaml",
    "7.txt": "strategy_reviewer.yaml",
    "7_h.txt": "strategy_human_refinement.yaml",
    "8.txt": "rebuttal_writer.yaml",
    "9.txt": "rebuttal_reviewer.yaml",
}
def load_prompt(name: str) -> str:
    """Return the text of the prompt called ``name``.

    The name is first translated through ``PROMPT_NAME_MAPPING`` (legacy
    ``.txt`` names to YAML names). When the translated file is missing, the
    file with the name exactly as given is used instead.

    For ``.yaml`` / ``.yml`` files the value of the top-level ``prompt`` key
    is returned; if the file is not valid YAML, or has no such key, the raw
    file text is returned. Any other file is returned as raw text.

    Raises:
        FileNotFoundError: neither the mapped nor the original file exists.
    """
    prompt_path = os.path.join(PROMPTS_DIR, PROMPT_NAME_MAPPING.get(name, name))
    if not os.path.exists(prompt_path):
        # Mapped file is absent: fall back to the name as given.
        legacy_path = os.path.join(PROMPTS_DIR, name)
        if not os.path.exists(legacy_path):
            raise FileNotFoundError(f"Prompt file not found: {legacy_path}")
        prompt_path = legacy_path

    with open(prompt_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    if not prompt_path.endswith(('.yaml', '.yml')):
        return raw_text

    try:
        parsed = yaml.safe_load(raw_text)
    except yaml.YAMLError:
        # Unparseable YAML is handed back untouched.
        return raw_text
    if isinstance(parsed, dict) and 'prompt' in parsed:
        return parsed['prompt']
    return raw_text