Spaces:

unpaper
/

AddPaper

Sleeping

App Files Community

katsukiai committed on Feb 28, 2025

Commit

c97b47f

verified ·

1 Parent(s): d8c4dbd

Update app.py

Browse files

Files changed (1) hide show

app.py +131 -351

app.py CHANGED Viewed

@@ -1,372 +1,152 @@
 import streamlit as st
 import requests
-import re
-import io
-import json
-import base64
-import tempfile
 from pathlib import Path
-from PyPDF2 import PdfReader
-from huggingface_hub import HfApi
-import arxiv
 import pandas as pd
-# CSS for styling
 st.markdown("""
-<style>
-    .main-header {font-size:2.5rem;color:#1E88E5;font-weight:bold;margin-bottom:1rem;}
-    .sub-header {font-size:1.5rem;color:#424242;margin-bottom:1rem;}
-    .arxiv-badge {background-color:#B31B1B;color:white;padding:0.3rem 0.6rem;border-radius:0.3rem;
-                text-decoration:none;font-weight:bold;display:inline-block;margin:0.5rem 0;}
-    .hf-badge {background-color:#FFBD45;color:black;padding:0.3rem 0.6rem;border-radius:0.3rem;
-              text-decoration:none;font-weight:bold;display:inline-block;margin:0.5rem 0.5rem 0.5rem 0;}
-    .footer {margin-top:3rem;text-align:center;color:#9E9E9E;}
-</style>
 """, unsafe_allow_html=True)
-# App title
-st.markdown('<div class="main-header">arXiv Paper Analyzer & HF Space Converter</div>', unsafe_allow_html=True)
-# Sidebar configuration
-st.sidebar.title("Settings")
-hf_token = st.sidebar.text_input("Hugging Face Token", type="password")
-username = st.sidebar.text_input("Hugging Face Username")
-space_name = st.sidebar.text_input("Space Name")
-if username and space_name:
-    hf_space_id = f"{username}/{space_name}"
-else:
-    hf_space_id = f"unpaper/{space_name}" if space_name else "unpaper/default-space"
-st.sidebar.markdown(f"**Space ID**: `{hf_space_id}`")
-# Model settings
-model_options = ["deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"]
-selected_model = st.sidebar.selectbox("Select AI Model", model_options)
-# Analysis options
-analysis_options = st.sidebar.multiselect(
-    "Analysis Options",
-    ["Author Analysis", "Paper Summary", "Key Findings", "Methodology", "Citations"],
-    default=["Author Analysis", "Paper Summary"]
-)
-# Display options
-enable_badges = st.sidebar.checkbox("Enable arXiv & HF Badges", value=True)
 # Functions
-def fetch_arxiv_paper(arxiv_id):
     client = arxiv.Client()
-    search = arxiv.Search(id_list=[arxiv_id])
-    result = list(client.results(search))
-    if not result:
-        return None
-    paper = result[0]
-    return {
-        'title': paper.title,
-        'authors': [author.name for author in paper.authors],
-        'summary': paper.summary,
-        'published': paper.published,
-        'pdf_url': paper.pdf_url,
-        'arxiv_id': arxiv_id
-    }
-def download_pdf(url):
-    response = requests.get(url)
-    if response.status_code == 200:
-        return io.BytesIO(response.content)
-    return None
-def extract_text_from_pdf(pdf_file):
-    reader = PdfReader(pdf_file)
     text = ""
-    for page in reader.pages:
-        text += page.extract_text() + "\n"
     return text
-def analyze_authors(pdf_text, authors_list):
-    authors_data = []
-    for author in authors_list:
-        mentions = len(re.findall(re.escape(author), pdf_text, re.IGNORECASE))
-        email_match = re.search(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', pdf_text)
-        email = email_match.group(1) if email_match else "Not found"
-        sections_with_author = []
-        for section in pdf_text.split('\n\n'):
-            if author.lower() in section.lower():
-                sections_with_author.append(section[:100] + "...")
-        authors_data.append({
-            'name': author,
-            'mentions': mentions,
-            'email': email,
-            'sections': sections_with_author[:3]
-        })
-    return authors_data
-def generate_arxiv_badge(arxiv_id):
-    return f"""<a href="https://arxiv.org/abs/{arxiv_id}" target="_blank" class="arxiv-badge">arXiv:{arxiv_id}</a>"""
-def generate_hf_badge(space_id):
-    return f"""<a href="https://huggingface.co/spaces/{space_id}" target="_blank" class="hf-badge">HF Space</a>"""
-def analyze_pdf_with_ai(pdf_text, model_name, analysis_type="summary"):
-    # Mock implementation (would be replaced with actual API call)
-    analysis_responses = {
-        "Author Analysis": "The paper has multiple authors with different contributions. The main author appears to lead the research direction.",
-        "Paper Summary": "This paper introduces a novel approach to analyzing scientific literature using transformer-based models.",
-        "Key Findings": "The approach demonstrates significant improvements over previous methods in several benchmark datasets.",
-        "Methodology": "The authors employ a multi-stage pipeline combining preprocessing, feature extraction, and classification.",
-        "Citations": "The paper cites 47 previous works, primarily from the last 5 years in the NLP and ML domains."
     }
-    return analysis_responses.get(analysis_type, f"Analysis for {analysis_type}")
-def deploy_to_huggingface(paper_data, pdf_content, analysis_results, hf_token, space_id):
-    try:
-        api = HfApi(token=hf_token)
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # Create the Space structure
-            readme_content = f"""# {paper_data['title']}
-This is an interactive analysis of the arXiv paper [{paper_data['arxiv_id']}](https://arxiv.org/abs/{paper_data['arxiv_id']}).
-## Authors
-{", ".join(paper_data['authors'])}
-## Summary
-{paper_data['summary'][:500]}...
-"""
-            with open(f"{temp_dir}/README.md", "w") as f:
-                f.write(readme_content)
-            # Save the PDF
-            with open(f"{temp_dir}/paper.pdf", "wb") as f:
-                f.write(pdf_content.getvalue())
-            # Create app.py for Streamlit
-            streamlit_app = f"""
-import streamlit as st
-import base64
-import json
-# Load analysis results
-with open("analysis.json", "r") as f:
-    analysis = json.load(f)
-st.title(analysis["title"])
-st.markdown(f"**Authors:** {{', '.join(analysis['authors'])}}")
-st.markdown(f"**arXiv ID:** [{{analysis['arxiv_id']}}](https://arxiv.org/abs/{{analysis['arxiv_id']}})")
-# Display PDF
-def show_pdf(file_path):
-    with open(file_path,"rb") as f:
-        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
-    pdf_display = f'<iframe src="data:application/pdf;base64,{{{{base64_pdf}}}}' + \
-                  f'" width="700" height="1000" type="application/pdf"></iframe>'
-    st.markdown(pdf_display, unsafe_allow_html=True)
-# Add tabs for different analyses
-tab1, tab2, tab3 = st.tabs(["Paper", "Summary", "Author Analysis"])
-with tab1:
-    show_pdf("paper.pdf")
-with tab2:
-    st.markdown(analysis["ai_analysis"]["Paper Summary"])
-with tab3:
-    for author in analysis["authors_analysis"]:
-        with st.expander(author["name"]):
-            st.write(f"Mentions in paper: {{author['mentions']}}")
-            st.write(f"Email: {{author['email']}}")
-            st.write("Sections mentioning this author:")
-            for section in author['sections']:
-                st.markdown(f"- {{section}}")
-"""
-            with open(f"{temp_dir}/app.py", "w") as f:
-                f.write(streamlit_app)
-            # Save analysis results
-            analysis_json = {
-                "title": paper_data['title'],
-                "authors": paper_data['authors'],
-                "arxiv_id": paper_data['arxiv_id'],
-                "published": str(paper_data['published']),
-                "summary": paper_data['summary'],
-                "authors_analysis": analysis_results["authors_data"],
-                "ai_analysis": analysis_results["ai_analysis"]
-            }
-            with open(f"{temp_dir}/analysis.json", "w") as f:
-                json.dump(analysis_json, f, indent=2)
-            # Create requirements.txt
-            requirements = """streamlit==1.26.0
-PyPDF2==3.0.1
-pandas==2.0.3
-numpy==1.24.3"""
-            with open(f"{temp_dir}/requirements.txt", "w") as f:
-                f.write(requirements)
-            # Create SDK configuration
-            sdk_config = {
-                "title": f"Analysis of arXiv:{paper_data['arxiv_id']}",
-                "emoji": "📚",
-                "colorFrom": "blue",
-                "colorTo": "indigo",
-                "sdk": "static",
-                "app_file": "app.py",
-                "pinned": False
-            }
-            # Upload to Hugging Face
-            space_url = api.create_repo(
-                repo_id=space_id,
-                repo_type="space",
-                space_sdk="streamlit",
-                private=False
-            )
-            # Upload files
-            for file_path in Path(temp_dir).glob("*"):
-                api.upload_file(
-                    path_or_fileobj=str(file_path),
-                    path_in_repo=file_path.name,
-                    repo_id=space_id,
-                    repo_type="space"
-                )
-            return f"https://huggingface.co/spaces/{space_id}"
-    except Exception as e:
-        st.error(f"Error deploying to Hugging Face: {str(e)}")
-        return None
-# Main app interface
-tab1, tab2 = st.tabs(["arXiv ID", "Upload PDF"])
-with tab1:
-    col1, col2 = st.columns([3, 1])
-    with col1:
-        arxiv_id = st.text_input("Enter arXiv ID (e.g. 2302.13971)", "2302.13971")
-    with col2:
-        fetch_button = st.button("Fetch Paper")
-    if fetch_button:
-        if not arxiv_id:
-            st.warning("Please enter a valid arXiv ID")
-        else:
-            with st.spinner("Fetching paper from arXiv..."):
-                paper_data = fetch_arxiv_paper(arxiv_id)
-                if paper_data:
-                    st.session_state['paper_data'] = paper_data
-                    st.session_state['pdf_source'] = 'arxiv'
-                    # Display paper info
-                    st.markdown(f"## {paper_data['title']}")
-                    st.markdown(f"**Authors**: {', '.join(paper_data['authors'])}")
-                    if enable_badges:
-                        st.markdown(generate_arxiv_badge(arxiv_id), unsafe_allow_html=True)
-                    with st.spinner("Downloading PDF..."):
-                        pdf_content = download_pdf(paper_data['pdf_url'])
-                        if pdf_content:
-                            st.session_state['pdf_content'] = pdf_content
-                            st.success("PDF successfully downloaded")
-                        else:
-                            st.error("Failed to download PDF")
-                else:
-                    st.error("Failed to fetch paper with the provided arXiv ID")
-with tab2:
-    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
-    if uploaded_file:
-        st.session_state['pdf_content'] = uploaded_file
-        st.session_state['pdf_source'] = 'upload'
-        # Try to extract basic info from PDF
-        with st.spinner("Extracting information from PDF..."):
-            pdf_text = extract_text_from_pdf(uploaded_file)
-            # Extract title (simplified approach)
-            title_match = re.search(r'^(.+?)(?:\n|$)', pdf_text)
-            title = title_match.group(1) if title_match else "Unknown Title"
-            # Extract authors (simplified approach)
-            authors_line = re.search(r'(?<=\n)(.+?)(?=\n)', pdf_text)
-            authors = authors_line.group(1).split(',') if authors_line else ["Unknown Author"]
-            paper_data = {
-                'title': title,
-                'authors': authors,
-                'summary': pdf_text[:500] + "...",
-                'published': "N/A",
-                'pdf_url': "N/A",
-                'arxiv_id': "custom_upload"
-            }
-            st.session_state['paper_data'] = paper_data
-            st.markdown(f"## {paper_data['title']}")
-            st.markdown(f"**Authors**: {', '.join(paper_data['authors'])}")
-# Analysis and conversion section
-if 'pdf_content' in st.session_state and 'paper_data' in st.session_state:
-    st.markdown("---")
-    st.markdown('<div class="sub-header">Paper Analysis & HF Space Conversion</div>', unsafe_allow_html=True)
-    if st.button("Analyze Paper and Create HF Space"):
-        if not hf_token or not space_name:
-            st.warning("Please provide your Hugging Face token and space name in the sidebar")
-        else:
-            with st.spinner("Processing PDF and analyzing content..."):
-                # Reset file pointer for reading
-                st.session_state['pdf_content'].seek(0)
-                pdf_text = extract_text_from_pdf(st.session_state['pdf_content'])
-                # Analyze authors
-                authors_data = analyze_authors(pdf_text, st.session_state['paper_data']['authors'])
-                # AI analysis of the paper
-                ai_analysis = {}
-                for analysis_type in analysis_options:
-                    with st.spinner(f"Performing {analysis_type}..."):
-                        ai_analysis[analysis_type] = analyze_pdf_with_ai(
-                            pdf_text, selected_model, analysis_type
-                        )
-                # Combine all analysis results
-                analysis_results = {
-                    "authors_data": authors_data,
-                    "ai_analysis": ai_analysis
-                }
-                # Display results
-                for analysis_type in analysis_options:
-                    with st.expander(analysis_type, expanded=True):
-                        st.markdown(ai_analysis[analysis_type])
-                        if analysis_type == "Author Analysis":
-                            for author in authors_data:
-                                with st.expander(author["name"]):
-                                    st.write(f"Mentions: {author['mentions']}")
-                                    st.write(f"Email: {author['email']}")
-                # Deploy to Hugging Face Space
-                st.markdown("### Deploying to Hugging Face Space")
-                with st.spinner("Creating and deploying Hugging Face Space..."):
-                    st.session_state['pdf_content'].seek(0)
-                    space_url = deploy_to_huggingface(
-                        st.session_state['paper_data'],
-                        st.session_state['pdf_content'],
-                        analysis_results,
-                        hf_token,
-                        hf_space_id
-                    )
-                    if space_url:
-                        st.success(f"Successfully deployed to Hugging Face Space!")
-                        st.markdown(f"**Space URL**: [{hf_space_id}]({space_url})")
-                        if enable_badges:
-                            st.markdown(generate_hf_badge(hf_space_id), unsafe_allow_html=True)
-                    else:
-                        st.error("Failed to deploy to Hugging Face Space")
-# Footer
-st.markdown('<div class="footer">arXiv Paper Analyzer & HF Space Converter</div>', unsafe_allow_html=True)

 import streamlit as st
+import arxiv
 import requests
+import os
 from pathlib import Path
+from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
+from huggingface_hub import login, HfApi
+import fitz  # PyMuPDF
 import pandas as pd
+from collections import Counter
+import re
+# Constants
+MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
+SECONDARY_MODEL = "distilbert-base-uncased"
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "your_username/<name>")
+SPACE_NAME = f"unpaper/<name>" if not HUGGINGFACE_TOKEN.startswith("your_username") else f"your_username/<name>"
+# CSS
 st.markdown("""
+    <style>
+    .main { background-color: #f5f5f5; }
+    .sidebar .sidebar-content { background-color: #ffffff; }
+    .badge {
+        background-color: #ff4b4b;
+        color: white;
+        padding: 5px 10px;
+        border-radius: 5px;
+        display: inline-block;
+    }
+    </style>
 """, unsafe_allow_html=True)
+# Sidebar
+st.sidebar.title("arXiv Paper Converter")
+st.sidebar.header("Settings")
+arxiv_id = st.sidebar.text_input("Enter arXiv ID", "2407.21783")
+upload_pdf = st.sidebar.file_uploader("Upload PDF", type="pdf")
+space_name = st.sidebar.text_input("Hugging Face Space Name", SPACE_NAME)
+token = st.sidebar.text_input("Hugging Face Token", HUGGINGFACE_TOKEN, type="password")
+# Login to Hugging Face
+if token:
+    login(token=token)
+# Initialize models
+@st.cache_resource
+def load_models():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
+    secondary_model = pipeline("text-classification", model=SECONDARY_MODEL)
+    return tokenizer, model, secondary_model
+tokenizer, model, secondary_model = load_models()
 # Functions
+def fetch_arxiv_paper(paper_id):
     client = arxiv.Client()
+    search = arxiv.Search(id_list=[paper_id])
+    paper = next(client.results(search))
+    return paper
+def download_pdf(paper, filename):
+    paper.download_pdf(filename=filename)
+    return filename
+def extract_text_from_pdf(pdf_path):
+    doc = fitz.open(pdf_path)
     text = ""
+    for page in doc:
+        text += page.get_text()
     return text
+def analyze_authors(text):
+    author_pattern = r"Author[s]?:\s*(.+?)(?:\n|$)"
+    authors = re.findall(author_pattern, text, re.IGNORECASE)
+    author_list = []
+    for author in authors:
+        names = author.split(',')
+        author_list.extend([name.strip() for name in names])
+    return Counter(author_list)
+def process_text_with_models(text):
+    inputs = tokenizer(text[:512], return_tensors="pt", truncation=True)
+    outputs = model(**inputs)
+    secondary_results = secondary_model(text[:512])
+    return outputs, secondary_results
+def create_huggingface_space(space_name, metadata):
+    api = HfApi()
+    api.create_repo(repo_id=space_name, repo_type="space", space_sdk="static", private=False)
+    api.upload_file(
+        path_or_fileobj="README.md",
+        path_in_repo="README.md",
+        repo_id=space_name,
+        repo_type="space"
+    )
+    return f"https://huggingface.co/spaces/{space_name}"
+# Main App
+st.title("arXiv Paper to Hugging Face Space Converter")
+st.markdown("<div class='badge'>Beta Community - Open Discussion in Community Tab</div>", unsafe_allow_html=True)
+# Process arXiv or PDF
+if arxiv_id or upload_pdf:
+    if upload_pdf:
+        pdf_path = "temp.pdf"
+        with open(pdf_path, "wb") as f:
+            f.write(upload_pdf.getbuffer())
+    else:
+        paper = fetch_arxiv_paper(arxiv_id)
+        pdf_path = download_pdf(paper, "temp.pdf")
+    # Extract and analyze
+    text = extract_text_from_pdf(pdf_path)
+    author_analysis = analyze_authors(text)
+    model_outputs, secondary_outputs = process_text_with_models(text)
+    # Display results
+    st.header("Paper Analysis")
+    st.subheader("Authors")
+    st.dataframe(pd.DataFrame.from_dict(author_analysis, orient='index', columns=['Count']))
+    st.subheader("AI Analysis")
+    st.write("Primary Model Outputs:", model_outputs.logits)
+    st.write("Secondary Model Outputs:", secondary_outputs)
+    # Metadata
+    metadata = {
+        "title": paper.title if arxiv_id else "Uploaded PDF",
+        "authors": list(author_analysis.keys()),
+        "arxiv_id": arxiv_id if arxiv_id else "N/A",
+        "model_analysis": {
+            "primary": str(model_outputs.logits),
+            "secondary": str(secondary_outputs)
+        }
     }
+    # Create Space
+    if st.button("Create Hugging Face Space"):
+        space_url = create_huggingface_space(space_name, metadata)
+        st.success(f"Space created: {space_url}")
+        st.markdown(f"""
+            <a href="{space_url}" target="_blank">
+                <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
+                     alt="Hugging Face Space" width="150">
+            </a>
+        """, unsafe_allow_html=True)
+# Cleanup
+if os.path.exists("temp.pdf"):
+    os.remove("temp.pdf")