Spaces:

rodrigocarrillo
/

ReviewersEmail

Running

App Files Files Community

rodrigocarrillo committed on about 1 month ago

Commit

44b5e1e

verified ·

1 Parent(s): dee3475

Create app.py

Browse files

Files changed (1) hide show

app.py +680 -0

app.py ADDED Viewed

	@@ -0,0 +1,680 @@

+import re
+import os
+import json
+import time
+from pymed import PubMed
+from copy import deepcopy
+from IPython.display import Markdown, display
+from langchain_google_genai import ChatGoogleGenerativeAI
+import gradio as gr
+import json
+from typing import Tuple
+# System prompt for the title-extraction LLM call. It asks for a JSON array of
+# objects with a single "title" field and no surrounding text, so that the
+# reply can be validated with json.loads by the callers below.
+SYSTEM_PROMPT_GET_TITLES_FROM_LIST_REFERENCES = """
+You are a helpful assistant that extracts information from scientific paper references.
+Given a list of paper references, identify the titles of the papers in these references.
+Omit the non-scientific papers in the list (e.g., websites or books)
+Return your response as a JSON array of objects with the following fields:
+- title: The title of the paper.
+Ensure the JSON is properly formatted.
+Do not include any text outside the JSON structure.
+Do not include any additional text, commentary, or explanation.
+"""
+# API key for the Gemini client, read from the environment (None when unset).
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+# Module-level chat client used by list_of_papers(); temperature 0.0 is set on it.
+# NOTE(review): the variable name says "25_pro" but the configured model is
+# "gemini-2.5-flash" — confirm which model is intended.
+llm_Gemini_25_pro = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.0, google_api_key=GOOGLE_API_KEY)
+def list_of_papers(list_of_papers_to_parse: str, max_retries: int = 3, retry_delay: int = 5) -> str:
+    """
+    Extracts paper titles from a list of references using an LLM.
+    Args:
+        list_of_papers_to_parse: String containing list of paper references
+        max_retries: Maximum number of retry attempts for API calls
+        retry_delay: Seconds to wait between retries
+    Returns:
+        JSON string with paper titles, or error message
+    """
+    # Input validation. Return error if input is empty.
+    if not list_of_papers_to_parse or not list_of_papers_to_parse.strip():
+        return json.dumps({"error": "Empty input provided. Please provide a list of references."})
+    # Try the LLM call with retries in case of failures.
+    for attempt in range(max_retries):
+        try:
+            response = llm_Gemini_25_pro.invoke([   # Call the LLM.
+                {"role": "system", "content": SYSTEM_PROMPT_GET_TITLES_FROM_LIST_REFERENCES},    # System prompt.
+                {"role": "user", "content": f"List of references:\n{list_of_papers_to_parse}"}   # User prompt.
+            ])
+            # Check if response is valid
+            if not response or not hasattr(response, 'content'):   # Validate response object.
+                raise ValueError("Invalid response from LLM")   # If response doesn't exist or it doesn't have 'content', raise error.
+            content = response.content.strip()   # Get content and strip whitespace.
+            # Check for empty response
+            if not content:
+                raise ValueError("LLM returned empty response")   # If content is empty, raise error.
+            # Parse the answer. Strip markdown code fences if present.
+            if content.startswith("```"):
+                content = re.sub(r'^```(?:json)?\s*\n', '', content)
+                content = re.sub(r'\n```\s*$', '', content)
+            content = content.strip()
+            # Validate that the output it's proper JSON.
+            try:
+                json.loads(content)  # Test if valid JSON
+                return content
+            except json.JSONDecodeError as e:
+                raise ValueError(f"LLM returned invalid JSON: {str(e)}")
+        except Exception as e:
+            print(f"Attempt {attempt + 1}/{max_retries} failed: {str(e)}")
+            if attempt < max_retries - 1:
+                print(f"Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+            else:
+                # Final attempt failed
+                error_message = {
+                    "error": "LLM service is currently unavailable",
+                    "message": "The service failed after multiple attempts. Please try again later.",
+                    "details": str(e)
+                }
+                return json.dumps(error_message, indent=2)
+    # This shouldn't be reached, but just in case
+    return json.dumps({"error": "Unexpected error occurred"})
+def fetch_paper_authors_from_pubmed(papers: list, delay: int=5, max_results: int=1, verbose: bool=True) -> list:
+    """
+    Fetch authors for each paper from PubMed.
+    """
+    pubmed = PubMed(tool="MyTool", email="rodrigo@gmail.com")
+    all_results = []
+    for i in range(len(papers)):
+        if verbose:
+            print(f"Processing paper {i+1}/{len(papers)}: {papers[i]['title']}")
+        time.sleep(delay)
+        results = pubmed.query(papers[i]['title'] + '[title]', max_results=max_results)
+        authors_for_this_paper = [article.authors for article in results]
+        if not authors_for_this_paper:
+            authors_for_this_paper = "No authors found"
+        all_results.append({
+            "paper_title": papers[i]["title"],
+            "authors": authors_for_this_paper
+        })
+    return all_results
+# Compiled once at import time; used by extract_emails_from_obj() to find
+# e-mail-shaped substrings (local part, '@', domain, dot, TLD of 2+ letters).
+email_regex = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
+def contains_email_symbol(obj) -> bool:
+    """Return True if '@' appears anywhere in the nested structure (str/list/dict)."""
+    if isinstance(obj, str):
+        return "@" in obj
+    if isinstance(obj, dict):
+        # check each value
+        return any(contains_email_symbol(v) for v in obj.values())
+    if isinstance(obj, list) or isinstance(obj, tuple):
+        return any(contains_email_symbol(item) for item in obj)
+    return False
+def extract_emails_from_obj(obj):
+    """Return list of email strings found anywhere in obj."""
+    found = set()
+    if isinstance(obj, str):
+        for m in email_regex.findall(obj):
+            found.add(m)
+    elif isinstance(obj, dict):
+        for v in obj.values():
+            found.update(extract_emails_from_obj(v))
+    elif isinstance(obj, (list, tuple)):
+        for item in obj:
+            found.update(extract_emails_from_obj(item))
+    return list(found)
+def flatten_one_or_many(authors):
+    """Recursively flatten nested lists/tuples into a single list of non-list elements."""
+    out = []
+    if isinstance(authors, (list, tuple)):
+        for item in authors:
+            if isinstance(item, (list, tuple)):
+                out.extend(flatten_one_or_many(item))
+            else:
+                out.append(item)
+    else:
+        out.append(authors)
+    return out
+def filter_all_results_keep_only_email_authors(all_results):
+    """
+    Given list of dicts like {'paper_title':..., 'authors': ...},
+    return a new list keeping only entries that have >=1 author with an email.
+    Within each kept entry, authors without emails are removed.
+    Duplicate emails are removed across all authors in the paper.
+    """
+    filtered_results = []
+    for entry in all_results:
+        authors_raw = entry.get("authors", [])
+        authors_flat = flatten_one_or_many(authors_raw)
+        authors_with_email = []
+        seen_emails = set()  # track emails already added for this paper
+        for a in authors_flat:
+            if contains_email_symbol(a):
+                emails = extract_emails_from_obj(a)
+                # remove duplicates per paper
+                emails_unique = [e for e in emails if e not in seen_emails]
+                if emails_unique:  # only keep if new emails
+                    seen_emails.update(emails_unique)
+                    if isinstance(a, dict):
+                        a_copy = deepcopy(a)
+                        a_copy["_found_emails"] = emails_unique
+                        authors_with_email.append(a_copy)
+                    else:
+                        authors_with_email.append({"raw": a, "_found_emails": emails_unique})
+        if authors_with_email:
+            new_entry = deepcopy(entry)
+            new_entry["authors"] = authors_with_email
+            filtered_results.append(new_entry)
+    return filtered_results
+def display_markdown_filtered_results(filtered_results):
+    """Display authors with emails nicely formatted (full name, affiliation, emails)."""
+    md_text = ""
+    for paper in filtered_results:
+        authors_nested = paper.get("authors", [])
+        # flatten authors list if nested
+        authors_flat = []
+        for a in authors_nested:
+            if isinstance(a, list):
+                authors_flat.extend(a)
+            else:
+                authors_flat.append(a)
+        # only keep authors that have '_found_emails'
+        authors_with_email = []
+        for author in authors_flat:
+            if isinstance(author, dict):
+                emails = author.get("_found_emails", [])
+                if emails:
+                    full_name = f"{author.get('firstname','')} {author.get('lastname','')}".strip()
+                    affiliation = author.get("affiliation", "").strip()
+                    authors_with_email.append((full_name, affiliation, emails))
+        # build markdown text
+        for name, affiliation, emails in authors_with_email:
+            md_text += f"- **Author:** {name}\n"
+            md_text += f"  - **Affiliation:** {affiliation}\n"
+            md_text += f"  - **Email(s):** {', '.join(emails)}\n\n"
+    display(Markdown(md_text))
+class PaperReviewerExtractor:
+    """
+    Main class to extract reviewer emails from a list of paper references.
+    This class orchestrates the entire workflow:
+    1. Extract paper titles from a reference list using LLM
+    2. Fetch author information from PubMed
+    3. Filter authors that have email addresses
+    4. Return structured results
+    """
+    def __init__(self):
+        self.google_api_key = 'GOOGLE_API_KEY'
+        self.llm = ChatGoogleGenerativeAI(
+            model="gemini-2.5-flash",
+            temperature=0.0,
+            google_api_key=self.google_api_key
+        )
+    def run(
+        self,
+        reference_list: str,
+        pubmed_delay: int = 5,
+        pubmed_max_results: int = 1,
+        llm_max_retries: int = 3,
+        llm_retry_delay: int = 5,
+        verbose: bool = True
+    ) -> dict:
+        """
+        Execute the complete pipeline to extract reviewer emails from references.
+        Args:
+            reference_list: String containing paper references
+            pubmed_delay: Delay in seconds between PubMed API calls
+            pubmed_max_results: Maximum results per PubMed query
+            llm_max_retries: Maximum retry attempts for LLM calls
+            llm_retry_delay: Delay in seconds between LLM retries
+            verbose: Whether to print progress messages
+        Returns:
+            Dictionary with keys:
+                - 'status': 'success' or 'error'
+                - 'papers': List of paper titles extracted
+                - 'authors_with_emails': Filtered authors with email addresses
+                - 'raw_authors': All authors found before email filtering
+                - 'error': Error message if status is 'error'
+        """
+        try:
+            if verbose:
+                print("Step 1: Extracting paper titles from references...")
+            # Step 1: Extract paper titles using LLM
+            papers_json = self._extract_paper_titles(
+                reference_list,
+                max_retries=llm_max_retries,
+                retry_delay=llm_retry_delay
+            )
+            papers = json.loads(papers_json)
+            # Check for error response
+            if isinstance(papers, dict) and "error" in papers:
+                return {
+                    'status': 'error',
+                    'papers': [],
+                    'authors_with_emails': [],
+                    'raw_authors': [],
+                    'error': papers.get('message', papers.get('error', 'Failed to extract paper titles'))
+                }
+            # Ensure papers is a list
+            if not isinstance(papers, list):
+                return {
+                    'status': 'error',
+                    'papers': [],
+                    'authors_with_emails': [],
+                    'raw_authors': [],
+                    'error': f'Expected list of papers, got {type(papers).__name__}'
+                }
+            if verbose:
+                print(f"✓ Found {len(papers)} papers\n")
+                print("Step 2: Fetching authors from PubMed...")
+            # Step 2: Fetch authors from PubMed
+            all_authors = self._fetch_authors_from_pubmed(
+                papers,
+                delay=pubmed_delay,
+                max_results=pubmed_max_results,
+                verbose=verbose
+            )
+            if verbose:
+                print(f"✓ Fetched authors for {len(all_authors)} papers\n")
+                print("Step 3: Filtering authors with email addresses...")
+            # Step 3: Filter authors with emails using the existing helper function
+            authors_with_emails = filter_all_results_keep_only_email_authors(all_authors)
+            if verbose:
+                print(f"✓ Found {len(authors_with_emails)} papers with authors having email addresses\n")
+                print("Pipeline complete!")
+            return {
+                'status': 'success',
+                'papers': papers,
+                'authors_with_emails': authors_with_emails,
+                'raw_authors': all_authors,
+                'error': None
+            }
+        except Exception as e:
+            error_msg = f"Pipeline error: {str(e)}"
+            if verbose:
+                print(f"✗ {error_msg}")
+            return {
+                'status': 'error',
+                'papers': [],
+                'authors_with_emails': [],
+                'raw_authors': [],
+                'error': error_msg
+            }
+    def _extract_paper_titles(
+        self,
+        reference_list: str,
+        max_retries: int = 3,
+        retry_delay: int = 5
+    ) -> str:
+        """Extract paper titles using the LLM."""
+        if not reference_list or not reference_list.strip():
+            return json.dumps({"error": "Empty input provided."})
+        for attempt in range(max_retries):
+            try:
+                response = self.llm.invoke([
+                    {"role": "system", "content": SYSTEM_PROMPT_GET_TITLES_FROM_LIST_REFERENCES},
+                    {"role": "user", "content": f"List of references:\n{reference_list}"}
+                ])
+                if not response or not hasattr(response, 'content'):
+                    raise ValueError("Invalid response from LLM")
+                content = response.content.strip()
+                if not content:
+                    raise ValueError("LLM returned empty response")
+                # Parse the answer and strip markdown code fences if present
+                if content.startswith("```"):
+                    content = re.sub(r'^```(?:json)?\s*\n', '', content)
+                    content = re.sub(r'\n```\s*$', '', content)
+                content = content.strip()
+                # Validate JSON
+                try:
+                    json.loads(content)
+                    return content
+                except json.JSONDecodeError as e:
+                    raise ValueError(f"Invalid JSON from LLM: {str(e)}")
+            except Exception as e:
+                if attempt < max_retries - 1:
+                    time.sleep(retry_delay)
+                else:
+                    return json.dumps({
+                        "error": "LLM service unavailable",
+                        "details": str(e)
+                    })
+        return json.dumps({"error": "Unexpected error"})
+    def _fetch_authors_from_pubmed(
+        self,
+        papers: list,
+        delay: int = 5,
+        max_results: int = 1,
+        verbose: bool = True
+    ) -> list:
+        """Fetch authors for each paper from PubMed."""
+        pubmed = PubMed(tool="MyTool", email="rodrigo@gmail.com")
+        all_results = []
+        for i in range(len(papers)):
+            if verbose:
+                print(f"  Processing paper {i+1}/{len(papers)}: {papers[i]['title']}")
+            time.sleep(delay)
+            results = pubmed.query(papers[i]['title'] + '[title]', max_results=max_results)
+            authors_for_this_paper = [article.authors for article in results]
+            if not authors_for_this_paper:
+                authors_for_this_paper = "No authors found"
+            all_results.append({
+                "paper_title": papers[i]["title"],
+                "authors": authors_for_this_paper
+            })
+        return all_results
+# Module-level extractor instance used by process_references() below.
+# Instantiating it runs PaperReviewerExtractor.__init__, which builds the
+# Gemini chat client at import time.
+extractor = PaperReviewerExtractor()
+def format_authors_as_markdown(filtered_results):
+    """Format authors with emails as nicely formatted markdown (matching cell 12 style)."""
+    md_text = ""
+    for paper in filtered_results:
+        authors_nested = paper.get("authors", [])
+        # flatten authors list if nested
+        authors_flat = []
+        for a in authors_nested:
+            if isinstance(a, list):
+                authors_flat.extend(a)
+            else:
+                authors_flat.append(a)
+        # only keep authors that have '_found_emails'
+        authors_with_email = []
+        for author in authors_flat:
+            if isinstance(author, dict):
+                emails = author.get("_found_emails", [])
+                if emails:
+                    full_name = f"{author.get('firstname','')} {author.get('lastname','')}".strip()
+                    affiliation = author.get("affiliation", "").strip()
+                    authors_with_email.append((full_name, affiliation, emails))
+        # build markdown text
+        if authors_with_email:
+            paper_title = paper.get("paper_title", "Unknown Paper")
+            md_text += f"## {paper_title}\n\n"
+            for name, affiliation, emails in authors_with_email:
+                md_text += f"- **Author:** {name}\n"
+                md_text += f"  - **Affiliation:** {affiliation}\n"
+                md_text += f"  - **Email(s):** {', '.join(emails)}\n\n"
+    return md_text if md_text else "No authors with email addresses found."
+def process_references(
+    reference_list: str,
+    pubmed_delay: int,
+    pubmed_max_results: int,
+    llm_max_retries: int,
+    llm_retry_delay: int
+) -> Tuple[str, str, str]:
+    """
+    Process references and return results in displayable format.
+    Args:
+        reference_list: The paste of paper references
+        pubmed_delay: Delay between PubMed API calls
+        pubmed_max_results: Max results per PubMed query
+        llm_max_retries: Max LLM retry attempts
+        llm_retry_delay: Delay between LLM retries
+    Returns:
+        Tuple of (papers_json, authors_markdown, status_message)
+    """
+    # Run the pipeline
+    result = extractor.run(
+        reference_list=reference_list,
+        pubmed_delay=pubmed_delay,
+        pubmed_max_results=pubmed_max_results,
+        llm_max_retries=llm_max_retries,
+        llm_retry_delay=llm_retry_delay,
+        verbose=True
+    )
+    if result['status'] == 'error':
+        error_msg = f"❌ Error: {result['error']}"
+        return "", "", error_msg
+    # Format papers output as JSON
+    papers_display = json.dumps(result['papers'], indent=2)
+    # Format authors with emails as nice markdown (not JSON)
+    authors_display = format_authors_as_markdown(result['authors_with_emails'])
+    # Create status message
+    status_msg = f"""
+✅ **Pipeline Completed Successfully!**
+📊 Summary:
+- Papers found: {len(result['papers'])}
+- Authors with emails: {len(result['authors_with_emails'])}
+"""
+    return papers_display, authors_display, status_msg
+# Create Gradio interface.
+# Layout: a header, then one row with two equal-width columns — inputs
+# (reference text, four tuning sliders, submit button) on the left and
+# outputs (status, extracted papers, authors markdown) on the right.
+with gr.Blocks(title="Paper Reviewer Email Extractor") as demo:
+    # NOTE(review): the instructions text below is still a placeholder.
+    gr.Markdown("""
+    # 📚 Paper Reviewer Email Extractor
+    > **Instructions & Rationale:**
+    >
+    > [PLACEHOLDER: Add your instructions and rationale here. Explain the purpose of this tool, how to use it, and the scientific/research justification for the work.]
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### Input Configuration")
+            # Reference list input
+            reference_input = gr.Textbox(
+                label="Paper References",
+                placeholder="Paste your list of paper references here...",
+                lines=10,
+                info="Provide a list of scientific paper references in any format"
+            )
+            # PubMed tuning: slider defaults mirror the defaults of PaperReviewerExtractor.run
+            with gr.Row():
+                pubmed_delay = gr.Slider(
+                    minimum=1,
+                    maximum=30,
+                    value=5,
+                    step=1,
+                    label="PubMed Delay (seconds)",
+                    info="Delay between PubMed API calls"
+                )
+                pubmed_max_results = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    value=1,
+                    step=1,
+                    label="PubMed Max Results",
+                    info="Maximum results per PubMed query"
+                )
+            # LLM retry tuning
+            with gr.Row():
+                llm_max_retries = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    value=3,
+                    step=1,
+                    label="LLM Max Retries",
+                    info="Maximum retry attempts for LLM calls"
+                )
+                llm_retry_delay = gr.Slider(
+                    minimum=1,
+                    maximum=30,
+                    value=5,
+                    step=1,
+                    label="LLM Retry Delay (seconds)",
+                    info="Delay between LLM retries"
+                )
+            submit_btn = gr.Button(
+                "🚀 Extract Reviewers",
+                variant="primary",
+                size="lg"
+            )
+        with gr.Column(scale=1):
+            gr.Markdown("### Outputs")
+            status_output = gr.Textbox(
+                label="Status",
+                interactive=False,
+                lines=5
+            )
+            papers_output = gr.Textbox(
+                label="Extracted Papers",
+                interactive=False,
+                lines=10,
+                max_lines=15
+            )
+            authors_output = gr.Markdown(
+                label="Authors with Email Addresses",
+                value="Results will appear here..."
+            )
+    # Connect button click to processing function.
+    # The inputs list follows process_references' parameter order, and the
+    # outputs list follows its return tuple (papers, authors, status).
+    submit_btn.click(
+        fn=process_references,
+        inputs=[
+            reference_input,
+            pubmed_delay,
+            pubmed_max_results,
+            llm_max_retries,
+            llm_retry_delay
+        ],
+        outputs=[
+            papers_output,
+            authors_output,
+            status_output
+        ]
+    )
+# Launch the interface only when the file is executed as a script.
+# share=True asks Gradio for a public share link.
+# NOTE(review): this app appears to be hosted on Hugging Face Spaces, where a
+# share link is presumably unnecessary — confirm whether share=True is wanted.
+if __name__ == "__main__":
+    demo.launch(share=True)