# Source: Hugging Face Space "Mqleet/RebuttalAgent" (Running on Zero)
# File size: 15,792 Bytes
# Revision: 3b58a0b

import sys
import os

# Cap the OpenMP thread count at 4 for this process. The variable is set
# before the remaining imports; presumably so that OpenMP-backed libraries
# loaded later pick up the limit (TODO confirm which library this targets).
os.environ["OMP_NUM_THREADS"] = "4"

# Device requested for docling's PDF pipeline. Read once at import time from
# the DOCLING_DEVICE environment variable (default "cpu") and lower-cased.
DOCLING_DEVICE = os.environ.get("DOCLING_DEVICE", "cpu").lower()

import re
import json
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.request
import urllib.parse
import threading
from arxiv import _fetch_metadata_by_id

# Best effort: switch stdout to UTF-8 when the stream object supports
# reconfigure(). Any failure is deliberately ignored so that an unusual
# stdout replacement cannot break module import.
try:
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    pass

# Opener used for requests whose host is in ARXIV_HOSTS (or a subdomain).
# NOTE(review): build_opener() with no arguments still installs the default
# ProxyHandler, so this opener honours the *_proxy environment variables just
# like urllib.request.urlopen does. If a genuinely proxy-free opener is
# intended, it would need build_opener(urllib.request.ProxyHandler({})).
ARXIV_DIRECT_OPENER = urllib.request.build_opener()
# Hostnames that are treated as arXiv when choosing the opener.
ARXIV_HOSTS = {"arxiv.org", "www.arxiv.org", "export.arxiv.org"}

# Held by pdf_to_md around docling initialisation and conversion so that only
# one PDF is converted at a time within this process.
PDF_CONVERT_LOCK = threading.Lock()

from pathlib import Path
import shutil

# Lazily created docling DocumentConverter, shared by all pdf_to_md calls.
# It stays None until the first conversion builds it.
_docling_converter = None
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably meant to record the device the cached converter was built for.
_docling_current_device = None

def _read_text(file_path: str) -> str:
    """Return the full contents of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        text = handle.read()
    return text

def _fix_json_escapes(json_str: str) -> str:
    r"""Escape stray backslashes so that ``json_str`` can be parsed as JSON.

    Model output often embeds LaTeX (``\alpha``, ``\underline{x}``) inside JSON
    strings without escaping the backslash, which makes ``json.loads`` fail.
    This scans the text once, left to right:

    * a backslash that starts a valid JSON escape -- ``\\``, ``\"``, ``\/``,
      ``\b``, ``\f``, ``\n``, ``\r``, ``\t`` or ``\uXXXX`` (exactly four hex
      digits) -- is left untouched together with the escape it introduces;
    * any other backslash is doubled so that it becomes a literal backslash.

    Limitation: LaTeX commands whose first letter is also a JSON escape
    character (``\nu``, ``\theta``, ``\beta``, ``\frac``, ``\rho`` ...) cannot
    be told apart from real escapes and are therefore left as they are.

    Args:
        json_str: Raw JSON text that may contain invalid backslash sequences.

    Returns:
        The text with every invalid backslash sequence escaped.
    """
    def _repair(match: "re.Match[str]") -> str:
        # Group 1 is only set when the backslash introduces a valid escape;
        # otherwise the match is a lone backslash that has to be doubled.
        return match.group(0) if match.group(1) else '\\\\'

    return re.sub(r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})|\\', _repair, json_str)

def pdf_to_md(pdf_path: str, output_path: str) -> str | None:
    """Convert a PDF file to Markdown with docling.

    The docling converter is created on first use and cached in the
    module-level ``_docling_converter``. Creation and conversion both run
    while holding ``PDF_CONVERT_LOCK``, so only one PDF is converted at a
    time. Writing the Markdown file happens after the lock is released.

    Note: docling is imported lazily to avoid triggering CUDA initialization
    errors in the HF Spaces Stateless GPU environment.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: Directory that receives ``<pdf stem>.md``; created if
            it does not exist.

    Returns:
        The path of the generated Markdown file, or None if anything failed
        (the error and a traceback are printed, nothing is raised).
    """
    global _docling_converter

    try:
        pdf_file = Path(pdf_path)

        # exist_ok=True instead of "check, then create": with several worker
        # threads sharing one output directory the two-step form can raise
        # FileExistsError, which would be reported as a failed conversion.
        os.makedirs(output_path, exist_ok=True)

        print(f"[DEBUG] Preparing to convert PDF: {pdf_path}")
        print("[DEBUG] Waiting to acquire docling conversion lock...")
        with PDF_CONVERT_LOCK:
            print("[DEBUG] Lock acquired, starting docling conversion...")

            if _docling_converter is None:
                device_str = DOCLING_DEVICE
                print(f"[DEBUG] First use, importing and initializing docling DocumentConverter ({device_str.upper()} mode)...")
                from docling.document_converter import DocumentConverter, PdfFormatOption
                from docling.datamodel.pipeline_options import PdfPipelineOptions
                from docling.datamodel.base_models import InputFormat
                try:
                    from docling.datamodel.pipeline_options import AcceleratorDevice
                    if device_str == "cuda":
                        accelerator_device = AcceleratorDevice.CUDA
                    else:
                        accelerator_device = AcceleratorDevice.CPU
                except ImportError:
                    # AcceleratorDevice is not importable from this module in
                    # the installed docling; pass the raw device string on.
                    accelerator_device = device_str

                pipeline_options = PdfPipelineOptions()
                pipeline_options.accelerator_options.device = accelerator_device

                _docling_converter = DocumentConverter(
                    format_options={
                        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
                    }
                )
                print(f"[DEBUG] docling DocumentConverter initialization complete ({device_str.upper()} mode)")

            converter = _docling_converter

            print("[DEBUG] Calling docling converter.convert()...")
            raw_result = converter.convert(pdf_path)

            # Accept both result shapes: an object wrapping the document in
            # ``.document`` and an object that exports Markdown directly.
            if hasattr(raw_result, 'document'):
                md_content = raw_result.document.export_to_markdown()
            else:
                md_content = raw_result.export_to_markdown()

            print("[DEBUG] docling conversion complete, releasing lock")

        target_md = os.path.join(output_path, pdf_file.stem + ".md")
        with open(target_md, 'w', encoding='utf-8') as f:
            f.write(md_content)

        print(f"[SUCCESS] Markdown file saved to: {target_md}")
        print(f"[DEBUG] Markdown file size: {len(md_content)} characters")

        return target_md

    except Exception as e:
        # Deliberate catch-all boundary: callers treat None as "conversion
        # failed" and fall back to a metadata-only Markdown file.
        print(f"[ERROR] pdf_to_md failed: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return None
def _safe_filename(name: str) -> str:
    """Return ``name`` made usable as a file name.

    Each run of the characters ``< > : " / \\ | ? *`` is collapsed into a
    single underscore, and the result is cut to at most 100 characters.
    """
    sanitized = re.sub(r'[<>:"/\\|?*]+', '_', name)
    return sanitized[:100]

def download_pdf_and_convert_md(paper: dict, output_dir: str) -> str | None:
    """Download a paper's PDF and convert it to Markdown.

    The PDF URL is taken from ``paper['pdf_url']`` or derived from
    ``paper['abs_url']`` by replacing ``/abs/`` with ``/pdf/``; a ``.pdf``
    suffix is appended when missing. An already downloaded PDF is reused.
    If no URL is available, or the download or the conversion fails, a
    metadata-only Markdown file is written instead.

    Args:
        paper: Dictionary containing paper metadata (title, arxiv_id, pdf_url,
            abs_url, abstract, authors). Missing or None values are tolerated.
        output_dir: Directory to save downloaded PDFs and converted Markdown
            files; created if it does not exist.

    Returns:
        Path of the converted Markdown file or of the fallback file, or None
        if even the fallback file could not be written.
    """
    papers_dir = output_dir

    # exist_ok=True instead of "check, then create": this function is meant to
    # be called from several threads with the same output directory, and the
    # two-step form can raise FileExistsError straight into the caller.
    os.makedirs(papers_dir, exist_ok=True)

    def create_fallback_markdown_file(paper: dict, safe_name: str, error_msg: str = "") -> str | None:
        """Create a Markdown file containing basic paper information.

        Returns the file path, or None if the file could not be written.
        """
        # ``or`` rather than a .get() default so that keys that are present
        # but None/empty also fall back to the placeholder text.
        title = paper.get('title') or 'Unknown Paper'
        abstract = paper.get('abstract') or 'No abstract available'
        arxiv_id = paper.get('arxiv_id') or 'N/A'
        pdf_url = paper.get('pdf_url') or ''
        abs_url = paper.get('abs_url') or ''
        authors = paper.get('authors')

        if not authors:
            authors_str = 'Unknown'
        elif isinstance(authors, str):
            authors_str = authors
        else:
            # str() keeps a non-string author entry from aborting the fallback.
            authors_str = ', '.join(str(author) for author in authors)

        error_line = f"**Error**: {error_msg}" if error_msg else ""

        md_content = f"""# {title}

**arXiv ID**: {arxiv_id}  
**Authors**: {authors_str}  
**PDF URL**: {pdf_url}  
**Abstract URL**: {abs_url}  

---

**Note**: PDF download or conversion failed. Only metadata is available.
{error_line}

---

## Abstract

{abstract}

---

**Full text is not available. Please refer to the original paper via the URLs above.**
"""

        md_path = os.path.join(papers_dir, f"{safe_name}.md")
        try:
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(md_content)
            print(f"[INFO] Created fallback Markdown file (metadata only): {md_path}")
            return md_path
        except Exception as e:
            print(f"[ERROR] Failed to create fallback Markdown: {e}")
            return None

    try:
        # A missing or None title must not abort the download at the log line.
        print(f"[DEBUG] Starting paper download: {str(paper.get('title') or 'Unknown')[:60]}...")
        print(f"[DEBUG] arXiv ID: {paper.get('arxiv_id', 'N/A')}")

        safe = _paper_safe_name(paper)

        pdf_url = paper.get('pdf_url') or ''
        if not pdf_url:
            abs_url = paper.get('abs_url') or ''
            if abs_url:
                pdf_url = abs_url.replace('/abs/', '/pdf/')
        if pdf_url and not pdf_url.endswith('.pdf'):
            pdf_url = pdf_url + '.pdf'

        if not pdf_url:
            print("[WARNING] Unable to get PDF URL, creating fallback Markdown")
            return create_fallback_markdown_file(paper, safe, "Unable to get PDF URL")

        print(f"[DEBUG] PDF URL: {pdf_url}")
        pdf_path = os.path.join(papers_dir, f"{safe}.pdf")

        try:
            if not os.path.exists(pdf_path):
                print(f"[DEBUG] Starting PDF download to: {pdf_path}")
                _download_pdf_with_retries(pdf_url, pdf_path)
                # Short random pause after a download to space out requests.
                time.sleep(random.uniform(1.0, 3.0))
            else:
                print(f"[DEBUG] PDF already exists, skipping download: {pdf_path}")

        except Exception as e:
            print(f"[ERROR] PDF download failed (all retries exhausted): {type(e).__name__}: {e}")
            print("[INFO] Creating fallback Markdown (metadata only)")
            return create_fallback_markdown_file(paper, safe, f"PDF download failed: {str(e)}")

        print("[DEBUG] Starting PDF to Markdown conversion...")
        md_path = pdf_to_md(pdf_path, papers_dir)

        if md_path and os.path.isfile(md_path):
            print(f"[SUCCESS] Markdown conversion successful: {md_path}")
            return md_path

        print("[WARNING] Markdown conversion failed, creating fallback Markdown")
        return create_fallback_markdown_file(paper, safe, "PDF conversion failed")

    except Exception as e:
        print(f"[WARNING] download_pdf_and_convert_md exception: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()

        # Last resort: still try to leave a metadata-only file behind.
        # ``except Exception`` (not a bare except) so that KeyboardInterrupt
        # and SystemExit are not swallowed here.
        try:
            safe = _paper_safe_name(paper)
            return create_fallback_markdown_file(paper, safe, f"Processing exception: {str(e)}")
        except Exception:
            return None


def _paper_safe_name(paper: dict) -> str:
    """Build the sanitized base file name (no extension) for ``paper``."""
    title = paper.get('title') or paper.get('arxiv_id') or 'paper'
    arxiv_id = paper.get('arxiv_id') or ''
    base_name = f"{arxiv_id}_{title[:50]}" if arxiv_id else title[:50]
    return _safe_filename(base_name)


def _download_pdf_with_retries(pdf_url: str, pdf_path: str,
                               max_retries: int = 3, retry_delay: float = 2) -> None:
    """Download ``pdf_url`` to ``pdf_path``, retrying failed attempts.

    A response is accepted only if it has at least 100 bytes and starts with
    the ``%PDF-`` signature. After the last failed attempt the underlying
    exception (HTTPError, ValueError for an invalid body, ...) is re-raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        # urllib never decompresses response bodies, so advertising gzip or
        # deflate could yield compressed bytes that fail the %PDF- check
        # below. Ask for the unencoded representation instead.
        'Accept-Encoding': 'identity',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }

    parsed = urllib.parse.urlparse(pdf_url)
    host = (parsed.hostname or '').lower()
    use_direct = (host in ARXIV_HOSTS) or any(host.endswith('.' + h) for h in ARXIV_HOSTS)

    print(f"[DEBUG] Using proxy mode: {'DIRECT_OPENER' if use_direct else 'urlopen (with proxy)'}")

    opener_open = ARXIV_DIRECT_OPENER.open if use_direct else urllib.request.urlopen

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            if attempt > 0:
                # Linearly growing delay plus jitter between attempts.
                wait_time = retry_delay * (attempt + 1) + random.uniform(0.5, 2.0)
                print(f"[INFO] Waiting {wait_time:.1f} seconds before retry (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait_time)

            print(f"[DEBUG] Sending HTTP request (attempt {attempt + 1}/{max_retries})...")
            req = urllib.request.Request(pdf_url, headers=headers)

            with opener_open(req, timeout=60) as resp:
                pdf_data = resp.read()
            print(f"[DEBUG] Download complete, size: {len(pdf_data)} bytes")

            if len(pdf_data) < 100:
                print(f"[WARNING] Downloaded file is too small ({len(pdf_data)} bytes), may not be a valid PDF")
                if not is_last_attempt:
                    continue
                raise ValueError(f"Downloaded file is too small: {len(pdf_data)} bytes")

            if not pdf_data.startswith(b'%PDF-'):
                print(f"[WARNING] Downloaded file is not a valid PDF (header: {pdf_data[:20]})")
                if not is_last_attempt:
                    continue
                raise ValueError("Downloaded file is not a valid PDF format")

            with open(pdf_path, 'wb') as f:
                f.write(pdf_data)
            print(f"[SUCCESS] PDF download successful: {pdf_path}")
            return

        except urllib.error.HTTPError as e:
            error_msg = f"HTTP Error {e.code}: {e.reason}"
            print(f"[WARNING] {error_msg}")

            if e.code == 403:
                print("[TIP] 403 error usually indicates:")
                print("  1. Request identified as bot (using real browser User-Agent)")
                print("  2. IP temporarily rate-limited (retrying...)")
                print("  3. Longer request interval needed")

            if is_last_attempt:
                raise

        except Exception as e:
            print(f"[WARNING] Download exception: {type(e).__name__}: {e}")
            if is_last_attempt:
                raise



# Absolute path of the directory that contains this source file.
_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
# Prompt files are looked up in the "prompts" directory next to this file.
PROMPTS_DIR = os.path.join(_CURRENT_DIR, "prompts")


import yaml

# Mapping from legacy prompt file names (numbered .txt files) to the
# descriptive YAML file names that replace them. load_prompt consults this
# table first; names that are not listed here are used unchanged.
PROMPT_NAME_MAPPING = {
    "1.txt": "semantic_encoder.yaml",
    "2.txt": "issue_extractor.yaml",
    "2_c.txt": "issue_extractor_checker.yaml",
    "3.txt": "literature_retrieval.yaml",
    "4.txt": "reference_filter.yaml",
    "5.txt": "reference_analyzer.yaml",
    "6.txt": "strategy_generator.yaml",
    "7.txt": "strategy_reviewer.yaml",
    "7_h.txt": "strategy_human_refinement.yaml",
    "8.txt": "rebuttal_writer.yaml",
    "9.txt": "rebuttal_reviewer.yaml",
}

def load_prompt(name: str) -> str:
    """Return the prompt text stored under ``name`` in ``PROMPTS_DIR``.

    The name is first translated through ``PROMPT_NAME_MAPPING`` (legacy
    ``.txt`` names map to YAML files); if that file does not exist, the name
    is tried as given. For a ``.yaml``/``.yml`` file whose top level is a
    mapping with a ``prompt`` key, that value is returned. In every other
    case, including YAML that fails to parse, the raw file content is
    returned.

    Raises:
        FileNotFoundError: Neither the mapped nor the original file exists.
    """
    # Prefer the mapped (new) file name, then the name exactly as requested.
    prompt_path = os.path.join(PROMPTS_DIR, PROMPT_NAME_MAPPING.get(name, name))
    if not os.path.exists(prompt_path):
        prompt_path = os.path.join(PROMPTS_DIR, name)
        if not os.path.exists(prompt_path):
            raise FileNotFoundError(f"Prompt file not found: {prompt_path}")

    with open(prompt_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    # Anything that is not a YAML file is returned verbatim.
    if not prompt_path.endswith(('.yaml', '.yml')):
        return raw_text

    try:
        parsed = yaml.safe_load(raw_text)
    except yaml.YAMLError:
        # Unparseable YAML: hand back the raw text instead of failing.
        return raw_text

    if isinstance(parsed, dict) and 'prompt' in parsed:
        return parsed['prompt']
    return raw_text