Talor Abramovich committed on
Commit
78a366f
·
1 Parent(s): 3335d02

application improved without binary assets

Browse files
Files changed (3) hide show
  1. app.py +182 -31
  2. prompts.yaml +3 -11
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,12 +1,18 @@
1
  import os
2
  import json
3
  import yaml
 
 
 
 
4
  from pathlib import Path
5
 
6
  import gradio as gr
7
  from huggingface_hub import InferenceClient
8
  from huggingface_hub.errors import BadRequestError
9
 
 
 
10
 
11
  def _normalize_message_content(content):
12
  if isinstance(content, list):
@@ -107,9 +113,142 @@ def _format_predictions_markdown(items):
107
  return "\n".join(out).strip("- \n")
108
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  def run_single_interaction(
111
- paper_title,
112
- paper_abstract,
113
  message_input,
114
  history,
115
  ablation_mode,
@@ -132,12 +271,6 @@ def run_single_interaction(
132
  raise gr.Error("This run is complete. Click Restart to begin a new interaction.")
133
 
134
  prior_history = _sanitize_history(history)
135
- paper_title = (paper_title or "").strip()
136
- paper_abstract = (paper_abstract or "").strip()
137
- if not paper_title:
138
- raise gr.Error("Please add a paper title before submitting.")
139
- if not paper_abstract:
140
- raise gr.Error("Please add a paper abstract before submitting.")
141
 
142
  text = ""
143
  files = []
@@ -160,20 +293,19 @@ def run_single_interaction(
160
  raise gr.Error("Please sign in with Hugging Face before submitting.")
161
 
162
  file_label = None
 
163
  if has_file:
164
  file_item = files[0]
165
  file_path = file_item.get("path") if isinstance(file_item, dict) else file_item
166
  file_label = os.path.basename(file_path) if file_path else "uploaded_file"
167
 
168
- paper_source = text if has_text else f"[uploaded file: {file_label}]"
169
  user_prompt_template = prompts["user_prompt"]
170
  user_content = (
171
- user_prompt_template.replace("{{paper_title}}", paper_title)
172
- .replace("{{problem_statement}}", paper_abstract)
173
- .replace("{{paper_source}}", paper_source)
174
  .replace("{{num_ablations}}", str(num_ablations))
175
  )
176
- user_display = f"Planning {num_ablations} ablations for paper: {paper_title}"
177
 
178
  client = InferenceClient(
179
  token=hf_token.token,
@@ -194,7 +326,7 @@ def run_single_interaction(
194
  )
195
  )
196
 
197
- done_status = "Interaction complete. Click Restart to run another one."
198
  emitted = False
199
  raw_output = ""
200
  predictions_message_idx = None
@@ -285,6 +417,9 @@ def run_single_interaction(
285
  True,
286
  )
287
 
 
 
 
288
 
289
  def change_ablation_mode(
290
  ablation_mode,
@@ -302,7 +437,7 @@ def change_ablation_mode(
302
  def restart_interaction():
303
  return (
304
  [],
305
- "Ready. Submit text or a single file.",
306
  False,
307
  )
308
 
@@ -329,6 +464,7 @@ with gr.Blocks(
329
  }
330
  """
331
  ) as demo:
 
332
  gr.Markdown(
333
  """
334
  # Ablation Bench
@@ -347,25 +483,33 @@ with gr.Blocks(
347
 
348
  status_text = gr.Markdown("Ready. Submit text or a single file.")
349
  restart_btn = gr.Button("↺")
350
- chatbot = gr.Chatbot(label="Ablation Plan", buttons=[restart_btn])
351
- interaction_locked = gr.State(False)
352
-
353
- paper_title = gr.Textbox(
354
- label="Paper title",
355
- placeholder="Enter paper title...",
356
- )
357
- paper_abstract = gr.Textbox(
358
- label="Paper abstract",
359
- placeholder="Enter paper abstract...",
360
- lines=5,
361
  )
 
362
 
363
  message_input = gr.MultimodalTextbox(
364
  label="Paper content",
365
  placeholder="Paste your paper content here or upload a single PDF/MD/TEX file or a single zip/gzip file of your paper.",
366
- lines=10,
367
  file_count="single",
368
- file_types=["text", ".zip", ".gz", ".pdf", ".md", ".tex"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  max_lines=1000,
370
  )
371
 
@@ -442,8 +586,6 @@ with gr.Blocks(
442
  message_input.submit(
443
  run_single_interaction,
444
  inputs=[
445
- paper_title,
446
- paper_abstract,
447
  message_input,
448
  chatbot,
449
  ablation_mode,
@@ -470,6 +612,15 @@ with gr.Blocks(
470
  ],
471
  )
472
 
 
 
 
 
 
 
 
 
 
473
  ablation_mode.input(
474
  change_ablation_mode,
475
  inputs=[
@@ -479,7 +630,7 @@ with gr.Blocks(
479
  num_ablations,
480
  ]
481
  )
482
-
483
 
484
  if __name__ == "__main__":
485
  demo.launch()
 
1
  import os
2
  import json
3
  import yaml
4
+ import gzip
5
+ import tarfile
6
+ import zipfile
7
+ import tempfile
8
  from pathlib import Path
9
 
10
  import gradio as gr
11
  from huggingface_hub import InferenceClient
12
  from huggingface_hub.errors import BadRequestError
13
 
14
+ TEXT_EXTENSIONS = {".tex", ".text", ".txt", ".bib", ".bbl", ".md"}
15
+
16
 
17
  def _normalize_message_content(content):
18
  if isinstance(content, list):
 
113
  return "\n".join(out).strip("- \n")
114
 
115
 
116
+ def _read_text_file(path: Path) -> str:
117
+ return path.read_text(encoding="utf-8", errors="ignore")
118
+
119
+
120
def _is_relevant_text_file(path: Path) -> bool:
    """Return True when *path* carries one of the recognised text extensions."""
    suffix = path.suffix.lower()
    return suffix in TEXT_EXTENSIONS
122
+
123
+
124
+ def _safe_extract_zip(zip_path: Path, output_dir: Path) -> None:
125
+ with zipfile.ZipFile(zip_path, "r") as zf:
126
+ for member in zf.infolist():
127
+ member_path = output_dir / member.filename
128
+ resolved_member = member_path.resolve()
129
+ resolved_root = output_dir.resolve()
130
+ if not str(resolved_member).startswith(str(resolved_root)):
131
+ continue
132
+ zf.extract(member, output_dir)
133
+
134
+
135
+ def _safe_extract_tar(tar_path: Path, output_dir: Path) -> None:
136
+ with tarfile.open(tar_path, "r:*") as tf:
137
+ for member in tf.getmembers():
138
+ member_path = output_dir / member.name
139
+ resolved_member = member_path.resolve()
140
+ resolved_root = output_dir.resolve()
141
+ if not str(resolved_member).startswith(str(resolved_root)):
142
+ continue
143
+ tf.extract(member, output_dir)
144
+
145
+
146
def _archive_to_tagged_source(extracted_root: Path) -> str:
    """Concatenate every relevant text file under *extracted_root*.

    Each file becomes a ``<file name="...">`` block; raises gr.Error when
    the archive contained no relevant text files at all.
    """
    tagged_parts = []
    for candidate in sorted(extracted_root.rglob("*")):
        if not (candidate.is_file() and _is_relevant_text_file(candidate)):
            continue
        try:
            rel_name = candidate.relative_to(extracted_root).as_posix()
            body = _read_text_file(candidate)
        except Exception:
            # Unreadable entries are skipped rather than failing the upload.
            continue
        tagged_parts.append(f'<file name="{rel_name}">\n{body}\n</file>\n')
    if not tagged_parts:
        raise gr.Error(
            "No relevant text files found in the archive. Expected .tex/.text/.txt/.bib/.bbl/.md files."
        )
    return "\n".join(tagged_parts)
162
+
163
+
164
def _convert_pdf_to_markdown(pdf_path: Path) -> str:
    """Convert a PDF to markdown text using the Marker SDK.

    Raises gr.Error when marker-pdf is not installed, when conversion
    fails, or when the converter produces no text at all.
    """
    try:
        from marker.converters.pdf import PdfConverter
        from marker.models import create_model_dict
        from marker.output import text_from_rendered
    except Exception as e:
        raise gr.Error(
            "Marker SDK is not available. Make sure `marker-pdf` is installed."
        ) from e

    try:
        converter = PdfConverter(artifact_dict=create_model_dict())
        rendered = converter(str(pdf_path))
        text, _, _ = text_from_rendered(rendered)
    except Exception as e:
        raise gr.Error(f"PDF conversion failed with Marker SDK: {e}") from e

    result = (text or "").strip()
    if not result:
        # Fall back to the renderer's own markdown attribute, if present.
        fallback = getattr(rendered, "markdown", "") if rendered is not None else ""
        result = (fallback or "").strip()
    if not result:
        raise gr.Error("Marker SDK produced empty output for this PDF.")
    return result
188
+
189
+
190
def _build_paper_source_from_upload(uploaded_path: str) -> str:
    """Turn an uploaded file into a textual "paper source" for the prompt.

    Supported inputs:
      * plain text files (TEXT_EXTENSIONS) -> returned verbatim
      * .zip / .tar / .tar.gz / .tgz archives -> relevant text files
        concatenated as <file name="..."> blocks
      * bare .gz / .gzip -> decompressed; payload must be a text file
        (or a tar archive, handled above)
      * .pdf -> converted to markdown via the Marker SDK

    Raises gr.Error for unsupported types or archives with no text files.
    """
    import shutil  # local import: only the gzip fallback path needs it

    src_path = Path(uploaded_path)
    file_name = src_path.name.lower()

    if _is_relevant_text_file(src_path):
        return _read_text_file(src_path)

    with tempfile.TemporaryDirectory(prefix="paper_extract_") as tmpdir:
        extract_root = Path(tmpdir) / "extracted"
        extract_root.mkdir(parents=True, exist_ok=True)

        if file_name.endswith(".zip"):
            _safe_extract_zip(src_path, extract_root)
            return _archive_to_tagged_source(extract_root)

        if file_name.endswith((".tar.gz", ".tgz", ".tar")):
            _safe_extract_tar(src_path, extract_root)
            return _archive_to_tagged_source(extract_root)

        if file_name.endswith((".gz", ".gzip")):
            # Handle compressed tar archives first.
            if tarfile.is_tarfile(src_path):
                _safe_extract_tar(src_path, extract_root)
                return _archive_to_tagged_source(extract_root)

            output_name = src_path.name
            if output_name.endswith(".gzip"):
                output_name = output_name[: -len(".gzip")]
            elif output_name.endswith(".gz"):
                output_name = output_name[: -len(".gz")]
            decompressed_path = extract_root / output_name

            # Stream instead of gz_in.read() so a large paper does not get
            # loaded into memory in one piece.
            with gzip.open(src_path, "rb") as gz_in, open(decompressed_path, "wb") as out_f:
                shutil.copyfileobj(gz_in, out_f)

            if _is_relevant_text_file(decompressed_path):
                return _read_text_file(decompressed_path)
            raise gr.Error(
                "Unsupported .gz/.gzip payload. It must contain a relevant text file or a tar archive."
            )

        if file_name.endswith(".pdf"):
            return _convert_pdf_to_markdown(src_path)

        raise gr.Error(
            "Unsupported file type. Use text files (.tex/.text/.txt/.bib/.bbl/.md), "
            "archives (.zip/.tar/.tar.gz/.tgz/.gz/.gzip), or .pdf."
        )
239
+
240
+
241
def get_all_marker_models():
    """Warm up the Marker SDK at app start.

    Builds the model dict once (result discarded) so the first PDF upload
    does not pay the cold-start cost; raises gr.Error if marker-pdf is
    missing or fails to initialise.
    """
    try:
        from marker.models import create_model_dict as _create_models
        _create_models()
    except Exception as exc:
        raise gr.Error(
            "Marker SDK is not available. Make sure `marker-pdf` is installed."
        ) from exc
249
+
250
+
251
  def run_single_interaction(
 
 
252
  message_input,
253
  history,
254
  ablation_mode,
 
271
  raise gr.Error("This run is complete. Click Restart to begin a new interaction.")
272
 
273
  prior_history = _sanitize_history(history)
 
 
 
 
 
 
274
 
275
  text = ""
276
  files = []
 
293
  raise gr.Error("Please sign in with Hugging Face before submitting.")
294
 
295
  file_label = None
296
+ file_path = None
297
  if has_file:
298
  file_item = files[0]
299
  file_path = file_item.get("path") if isinstance(file_item, dict) else file_item
300
  file_label = os.path.basename(file_path) if file_path else "uploaded_file"
301
 
302
+ paper_source = text if has_text else _build_paper_source_from_upload(file_path)
303
  user_prompt_template = prompts["user_prompt"]
304
  user_content = (
305
+ user_prompt_template.replace("{{paper_source}}", paper_source)
 
 
306
  .replace("{{num_ablations}}", str(num_ablations))
307
  )
308
+ user_display = f"Planning {num_ablations} ablations from submitted paper."
309
 
310
  client = InferenceClient(
311
  token=hf_token.token,
 
326
  )
327
  )
328
 
329
+ done_status = "Ablation plan complete. Click Restart to run another one."
330
  emitted = False
331
  raw_output = ""
332
  predictions_message_idx = None
 
417
  True,
418
  )
419
 
420
def print_like_dislike(x: gr.LikeData):
    """Log a chatbot like/dislike event (index, value, liked) to stdout."""
    feedback = (x.index, x.value, x.liked)
    print(*feedback)
422
+
423
 
424
  def change_ablation_mode(
425
  ablation_mode,
 
437
def restart_interaction():
    """Reset the chat UI: empty history, fresh status line, unlocked state."""
    fresh_history = []
    return (fresh_history, "Ready. Submit your paper.", False)
443
 
 
464
  }
465
  """
466
  ) as demo:
467
+ demo.load(get_all_marker_models)
468
  gr.Markdown(
469
  """
470
  # Ablation Bench
 
483
 
484
  status_text = gr.Markdown("Ready. Submit text or a single file.")
485
  restart_btn = gr.Button("↺")
486
+ chatbot = gr.Chatbot(
487
+ label="Ablation Plan",
488
+ buttons=[restart_btn, "copy"],
 
 
 
 
 
 
 
 
489
  )
490
+ interaction_locked = gr.State(False)
491
 
492
  message_input = gr.MultimodalTextbox(
493
  label="Paper content",
494
  placeholder="Paste your paper content here or upload a single PDF/MD/TEX file or a single zip/gzip file of your paper.",
495
+ lines=5,
496
  file_count="single",
497
+ file_types=[
498
+ "text",
499
+ ".tex",
500
+ ".text",
501
+ ".txt",
502
+ ".bib",
503
+ ".bbl",
504
+ ".md",
505
+ ".zip",
506
+ ".tar",
507
+ ".tar.gz",
508
+ ".tgz",
509
+ ".gz",
510
+ ".gzip",
511
+ ".pdf",
512
+ ],
513
  max_lines=1000,
514
  )
515
 
 
586
  message_input.submit(
587
  run_single_interaction,
588
  inputs=[
 
 
589
  message_input,
590
  chatbot,
591
  ablation_mode,
 
612
  ],
613
  )
614
 
615
+ chatbot.clear(
616
+ restart_interaction,
617
+ outputs=[
618
+ chatbot,
619
+ status_text,
620
+ interaction_locked,
621
+ ]
622
+ )
623
+
624
  ablation_mode.input(
625
  change_ablation_mode,
626
  inputs=[
 
630
  num_ablations,
631
  ]
632
  )
633
+ chatbot.like(print_like_dislike)
634
 
635
  if __name__ == "__main__":
636
  demo.launch()
prompts.yaml CHANGED
@@ -23,11 +23,7 @@ author_ablation:
23
  {"name": "Ablation B", "ablated_part": "description of the ablated part", "action": "REPLACE", "replacement": ["replacement1", "replacement2"], "metrics": ["metric3"]}
24
  </predictions>
25
  user_prompt: |-
26
- We're currently want to suggest ablation studies for the research titled {{paper_title}}. Here's the research abstract:
27
- ABSTRACT:
28
- {{problem_statement}}
29
-
30
- The paper source is provided below, after all of the instructions.
31
 
32
  INSTRUCTIONS:
33
  Now, you're going to suggest UP TO {{num_ablations}} ablation studies on your own, in a JSONL format.
@@ -85,11 +81,7 @@ reviewer_ablation:
85
  {"name": "Ablation B", "ablated_part": "description of the ablated part", "action": "REPLACE", "replacement": ["replacement1", "replacement2"], "metrics": ["metric3"]}
86
  </predictions>
87
  user_prompt: |-
88
- We're currently want to suggest missing ablation studies for the research titled {{paper_title}}. Here's the research abstract:
89
- ABSTRACT:
90
- {{problem_statement}}
91
-
92
- The paper source is provided below, after all of the instructions.
93
 
94
  INSTRUCTIONS:
95
  Now, you're going to suggest UP TO {{num_ablations}} missing ablation studies in the given paper on your own, in a JSONL format.
@@ -128,4 +120,4 @@ reviewer_ablation:
128
 
129
  <paper_source>
130
  {{paper_source}}
131
- </paper_source>
 
23
  {"name": "Ablation B", "ablated_part": "description of the ablated part", "action": "REPLACE", "replacement": ["replacement1", "replacement2"], "metrics": ["metric3"]}
24
  </predictions>
25
  user_prompt: |-
26
+ We currently want to suggest ablation studies for the following research paper source.
 
 
 
 
27
 
28
  INSTRUCTIONS:
29
  Now, you're going to suggest UP TO {{num_ablations}} ablation studies on your own, in a JSONL format.
 
81
  {"name": "Ablation B", "ablated_part": "description of the ablated part", "action": "REPLACE", "replacement": ["replacement1", "replacement2"], "metrics": ["metric3"]}
82
  </predictions>
83
  user_prompt: |-
84
+ We currently want to suggest missing ablation studies for the following research paper source.
 
 
 
 
85
 
86
  INSTRUCTIONS:
87
  Now, you're going to suggest UP TO {{num_ablations}} missing ablation studies in the given paper on your own, in a JSONL format.
 
120
 
121
  <paper_source>
122
  {{paper_source}}
123
+ </paper_source>
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- pytz
 
 
1
+ pytz
2
+ marker-pdf