Create App.py
App.py
ADDED
@@ -0,0 +1,392 @@
import os
import re
import time
import logging
import zipfile
import requests
import bibtexparser
from tqdm import tqdm
from urllib.parse import quote, urlencode
import gradio as gr
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

class PaperDownloader:
    def __init__(self, output_dir='papers'):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        # Sci-Hub mirrors, tried in order. Libgen is handled separately
        # (its search needs a ?q= query string, not a DOI path suffix).
        self.download_sources = [
            'https://sci-hub.ee/',
            'https://sci-hub.st/',
            'https://sci-hub.ru/',
            'https://sci-hub.ren/',
            'https://sci-hub.mksa.top/',
            'https://sci-hub.se/',
        ]

        # Request headers
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
        }

    def clean_doi(self, doi):
        """Clean and URL-encode a DOI."""
        return quote(doi.strip()) if doi else None

    def download_paper_scihub(self, doi):
        """Try to download a paper from the Sci-Hub mirrors."""
        if not doi:
            logger.warning("DOI not provided")
            return None

        for base_url in self.download_sources:
            try:
                scihub_url = f"{base_url}{self.clean_doi(doi)}"

                # Tolerant request: follow redirects, generous timeout
                response = requests.get(scihub_url,
                                        headers=self.headers,
                                        allow_redirects=True,
                                        timeout=15)

                # Scan the page for several PDF URL patterns
                pdf_patterns = [
                    r'(https?://[^\s<>"]+?\.pdf)',
                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
                    r'(https?://[^\s<>"]+?/pdf/[^\s<>"]+)',
                ]

                pdf_urls = []
                for pattern in pdf_patterns:
                    pdf_urls.extend(re.findall(pattern, response.text))

                # Try downloading from each candidate URL
                for pdf_url in pdf_urls:
                    try:
                        pdf_response = requests.get(pdf_url,
                                                    headers=self.headers,
                                                    timeout=10)

                        # Verify the response is actually a PDF
                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                            logger.debug(f"Found PDF at: {pdf_url}")
                            return pdf_response.content
                    except Exception as e:
                        logger.debug(f"Error downloading PDF from {pdf_url}: {e}")

            except Exception as e:
                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")

        return None

    def download_paper_libgen(self, doi):
        """Download from Libgen, handling the search query and the redirect."""
        if not doi:
            return None

        base_url = 'https://libgen.rs/scimag/'
        try:
            search_url = f"{base_url}?q={self.clean_doi(doi)}"
            response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
            response.raise_for_status()

            if "No results" in response.text:
                logger.debug(f"No results for DOI: {doi} on Libgen")
                return None

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the download link via a specific table selector
            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')

            if links:
                pdf_url = links[0]['href']
                pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                    logger.debug(f"Found PDF at: {pdf_url}")
                    return pdf_response.content

        except Exception as e:
            logger.debug(f"Error trying to download {doi} from Libgen: {e}")
        return None

    def download_paper_google_scholar(self, doi):
        """Search Google Scholar for the DOI and try to fetch a PDF."""
        if not doi:
            return None

        try:
            query = f'doi:"{doi}"'
            params = {'q': query}
            url = f'https://scholar.google.com/scholar?{urlencode(params)}'

            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find any links labelled [PDF]
            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))

            if links:
                pdf_url = links[0]['href']
                pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                    logger.debug(f"Found PDF at: {pdf_url}")
                    return pdf_response.content
        except Exception as e:
            logger.debug(f"Google Scholar error for {doi}: {e}")

        return None

    def download_paper_crossref(self, doi):
        """Alternative method: look for an open-access PDF link via the Crossref API."""
        if not doi:
            return None

        try:
            url = f"https://api.crossref.org/works/{doi}"
            response = requests.get(url, headers=self.headers, timeout=10)

            if response.status_code == 200:
                data = response.json()
                work = data.get('message', {})

                # Look for links that point directly to a PDF
                links = work.get('link', [])
                for link in links:
                    if link.get('content-type') == 'application/pdf':
                        pdf_url = link.get('URL')
                        if pdf_url:
                            pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
                            if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                                logger.debug(f"Found PDF at: {pdf_url}")
                                return pdf_response.content

        except Exception as e:
            logger.debug(f"Crossref error for {doi}: {e}")

        return None

    def download_with_retry(self, doi, max_retries=3, initial_delay=2):
        """Download a paper using multiple strategies, retrying with exponential backoff."""
        pdf_content = None
        retries = 0
        delay = initial_delay

        while retries < max_retries and not pdf_content:
            try:
                # Try each source in turn; the first one that returns content wins
                pdf_content = (
                    self.download_paper_scihub(doi) or
                    self.download_paper_libgen(doi) or
                    self.download_paper_google_scholar(doi) or
                    self.download_paper_crossref(doi)
                )

                if pdf_content:
                    return pdf_content
            except Exception as e:
                logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")

            if not pdf_content:
                retries += 1
                logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
                time.sleep(delay)
                delay *= 2  # Exponential backoff

        return None

    def download_single_doi(self, doi):
        """Download a single paper by DOI."""
        if not doi:
            return None, "Error: DOI not provided", "Error: DOI not provided"

        try:
            pdf_content = self.download_with_retry(doi)

            if pdf_content:
                filename = f"{doi.replace('/', '_').replace('.', '_')}.pdf"
                filepath = os.path.join(self.output_dir, filename)
                with open(filepath, 'wb') as f:
                    f.write(pdf_content)
                logger.info(f"Successfully downloaded: {filename}")
                return filepath, "Successfully downloaded", ""
            else:
                logger.warning(f"Could not download: {doi}")
                return None, f"Could not download {doi}", f"Could not download {doi}"

        except Exception as e:
            logger.error(f"Error processing {doi}: {e}")
            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"

    def download_multiple_dois(self, dois_text):
        """Download multiple papers from a newline-separated list of DOIs."""
        if not dois_text:
            return None, "Error: No DOIs provided", "Error: No DOIs provided"

        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
        if not dois:
            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"

        downloaded_files = []
        failed_dois = []
        for doi in tqdm(dois, desc="Downloading papers"):
            filepath, success_message, fail_message = self.download_single_doi(doi)
            if filepath:
                downloaded_files.append(filepath)
            else:
                failed_dois.append(doi)

        # Bundle the downloaded PDFs into a ZIP
        zip_filename = None  # stays None when nothing was downloaded
        if downloaded_files:
            zip_filename = 'papers.zip'
            with zipfile.ZipFile(zip_filename, 'w') as zipf:
                for file_path in downloaded_files:
                    zipf.write(file_path, arcname=os.path.basename(file_path))
            logger.info(f"ZIP file created: {zip_filename}")

        return zip_filename, "\n".join(downloaded_files), "\n".join(failed_dois)

    def process_bibtex(self, bib_file):
        """Process a BibTeX file and download its papers with multiple strategies."""
        # Read BibTeX content from the uploaded file object
        try:
            with open(bib_file.name, 'r', encoding='utf-8') as f:
                bib_content = f.read()
        except Exception as e:
            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"

        # Parse the BibTeX data
        try:
            bib_database = bibtexparser.loads(bib_content)
        except Exception as e:
            logger.error(f"Error parsing BibTeX data: {e}")
            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"

        # Extract DOIs
        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
        logger.info(f"Found {len(dois)} DOIs to download")

        # Result lists
        downloaded_files = []
        failed_dois = []

        # Download the PDFs
        for doi in tqdm(dois, desc="Downloading papers"):
            try:
                # Try all methods, with retries
                pdf_content = self.download_with_retry(doi)

                # Save the PDF
                if pdf_content:
                    filename = f"{doi.replace('/', '_').replace('.', '_')}.pdf"
                    filepath = os.path.join(self.output_dir, filename)

                    with open(filepath, 'wb') as f:
                        f.write(pdf_content)

                    downloaded_files.append(filepath)
                    logger.info(f"Successfully downloaded: {filename}")
                else:
                    failed_dois.append(doi)
                    logger.warning(f"Could not download: {doi}")

            except Exception as e:
                failed_dois.append(doi)
                logger.error(f"Error processing {doi}: {e}")

        # Create a ZIP of the downloaded papers
        zip_filename = None  # stays None when nothing was downloaded
        if downloaded_files:
            zip_filename = 'papers.zip'
            with zipfile.ZipFile(zip_filename, 'w') as zipf:
                for file_path in downloaded_files:
                    zipf.write(file_path, arcname=os.path.basename(file_path))
            logger.info(f"ZIP file created: {zip_filename}")

        return zip_filename, "\n".join(downloaded_files), "\n".join(failed_dois)


def create_gradio_interface():
    """Create the Gradio interface for the paper downloader."""
    downloader = PaperDownloader()

    def download_papers(bib_file, doi_input, dois_input):
        if bib_file:
            # Check file type
            if not bib_file.name.lower().endswith('.bib'):
                return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None

            zip_path, downloaded_dois, failed_dois = downloader.process_bibtex(bib_file)
            return zip_path, downloaded_dois, failed_dois, None
        elif doi_input:
            filepath, message, failed_doi = downloader.download_single_doi(doi_input)
            return None, message, failed_doi, filepath
        elif dois_input:
            zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
            return zip_path, downloaded_dois, failed_dois, None
        else:
            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None

    # Gradio interface
    interface = gr.Interface(
        fn=download_papers,
        inputs=[
            gr.File(file_types=['.bib'], label="Upload BibTeX File"),
            gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
            gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
        ],
        outputs=[
            gr.File(label="Download Papers (ZIP) or Single PDF"),
            gr.Textbox(label="Downloaded DOIs/Message"),
            gr.Textbox(label="Failed DOIs"),
            gr.File(label="Downloaded Single PDF")
        ],
        title="🔬 Academic Paper Batch Downloader",
        description="Upload a BibTeX file or enter DOIs to download PDFs. The app attempts to fetch PDFs from multiple sources: Sci-Hub, Libgen, Google Scholar, and Crossref. You can use any one of the three inputs.",
        theme="soft",
        examples=[
            ["example.bib", None, None],  # BibTeX file
            [None, "10.1038/nature12373", None],  # Single DOI
            [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"],  # Multiple DOIs
        ],
        css="""
        .gradio-container {
            background-color: #f4f4f4;
        }
        .gr-interface {
            max-width: 800px;
            margin: 0 auto;
        }
        .gr-box {
            background-color: white;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        }
        """
    )

    return interface


def main():
    interface = create_gradio_interface()
    interface.launch(share=True)


if __name__ == "__main__":
    main()
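
The PaperDownloader class can also be driven directly from Python, without the Gradio UI. A minimal usage sketch, assuming App.py is importable and the dependencies above (requests, bs4, bibtexparser, tqdm, gradio) are installed:

# Minimal programmatic usage sketch; the launch() call is guarded by
# __main__, so importing App does not start the web server.
from App import PaperDownloader

downloader = PaperDownloader(output_dir='papers')

# download_single_doi returns (filepath, success_message, fail_message)
filepath, ok_msg, fail_msg = downloader.download_single_doi("10.1038/nature12373")
print(filepath or fail_msg)

# download_multiple_dois takes newline-separated DOIs and returns
# (zip_path_or_None, downloaded_files_text, failed_dois_text)
zip_path, downloaded, failed = downloader.download_multiple_dois(
    "10.1109/5.771073\n10.3390/horticulturae8080677"
)
print(zip_path, failed)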