Talor Abramovich committed on
Commit
78a366f
·
1 Parent(s): 3335d02

application improved without binary assets

Browse files
Files changed (3) hide show
  1. app.py +182 -31
  2. prompts.yaml +3 -11
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,12 +1,18 @@
1
  import os
2
  import json
3
  import yaml
 
 
 
 
4
  from pathlib import Path
5
 
6
  import gradio as gr
7
  from huggingface_hub import InferenceClient
8
  from huggingface_hub.errors import BadRequestError
9
 
 
 
10
 
11
  def _normalize_message_content(content):
12
  if isinstance(content, list):
@@ -107,9 +113,142 @@ def _format_predictions_markdown(items):
107
  return "\n".join(out).strip("- \n")
108
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  def run_single_interaction(
111
- paper_title,
112
- paper_abstract,
113
  message_input,
114
  history,
115
  ablation_mode,
@@ -132,12 +271,6 @@ def run_single_interaction(
132
  raise gr.Error("This run is complete. Click Restart to begin a new interaction.")
133
 
134
  prior_history = _sanitize_history(history)
135
- paper_title = (paper_title or "").strip()
136
- paper_abstract = (paper_abstract or "").strip()
137
- if not paper_title:
138
- raise gr.Error("Please add a paper title before submitting.")
139
- if not paper_abstract:
140
- raise gr.Error("Please add a paper abstract before submitting.")
141
 
142
  text = ""
143
  files = []
@@ -160,20 +293,19 @@ def run_single_interaction(
160
  raise gr.Error("Please sign in with Hugging Face before submitting.")
161
 
162
  file_label = None
 
163
  if has_file:
164
  file_item = files[0]
165
  file_path = file_item.get("path") if isinstance(file_item, dict) else file_item
166
  file_label = os.path.basename(file_path) if file_path else "uploaded_file"
167
 
168
- paper_source = text if has_text else f"[uploaded file: {file_label}]"
169
  user_prompt_template = prompts["user_prompt"]
170
  user_content = (
171
- user_prompt_template.replace("{{paper_title}}", paper_title)
172
- .replace("{{problem_statement}}", paper_abstract)
173
- .replace("{{paper_source}}", paper_source)
174
  .replace("{{num_ablations}}", str(num_ablations))
175
  )
176
- user_display = f"Planning {num_ablations} ablations for paper: {paper_title}"
177
 
178
  client = InferenceClient(
179
  token=hf_token.token,
@@ -194,7 +326,7 @@ def run_single_interaction(
194
  )
195
  )
196
 
197
- done_status = "Interaction complete. Click Restart to run another one."
198
  emitted = False
199
  raw_output = ""
200
  predictions_message_idx = None
@@ -285,6 +417,9 @@ def run_single_interaction(
285
  True,
286
  )
287
 
 
 
 
288
 
289
  def change_ablation_mode(
290
  ablation_mode,
@@ -302,7 +437,7 @@ def change_ablation_mode(
302
  def restart_interaction():
303
  return (
304
  [],
305
- "Ready. Submit text or a single file.",
306
  False,
307
  )
308
 
@@ -329,6 +464,7 @@ with gr.Blocks(
329
  }
330
  """
331
  ) as demo:
 
332
  gr.Markdown(
333
  """
334
  # Ablation Bench
@@ -347,25 +483,33 @@ with gr.Blocks(
347
 
348
  status_text = gr.Markdown("Ready. Submit text or a single file.")
349
  restart_btn = gr.Button("↺")
350
- chatbot = gr.Chatbot(label="Ablation Plan", buttons=[restart_btn])
351
- interaction_locked = gr.State(False)
352
-
353
- paper_title = gr.Textbox(
354
- label="Paper title",
355
- placeholder="Enter paper title...",
356
- )
357
- paper_abstract = gr.Textbox(
358
- label="Paper abstract",
359
- placeholder="Enter paper abstract...",
360
- lines=5,
361
  )
 
362
 
363
  message_input = gr.MultimodalTextbox(
364
  label="Paper content",
365
  placeholder="Paste your paper content here or upload a single PDF/MD/TEX file or a single zip/gzip file of your paper.",
366
- lines=10,
367
  file_count="single",
368
- file_types=["text", ".zip", ".gz", ".pdf", ".md", ".tex"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  max_lines=1000,
370
  )
371
 
@@ -442,8 +586,6 @@ with gr.Blocks(
442
  message_input.submit(
443
  run_single_interaction,
444
  inputs=[
445
- paper_title,
446
- paper_abstract,
447
  message_input,
448
  chatbot,
449
  ablation_mode,
@@ -470,6 +612,15 @@ with gr.Blocks(
470
  ],
471
  )
472
 
 
 
 
 
 
 
 
 
 
473
  ablation_mode.input(
474
  change_ablation_mode,
475
  inputs=[
@@ -479,7 +630,7 @@ with gr.Blocks(
479
  num_ablations,
480
  ]
481
  )
482
-
483
 
484
  if __name__ == "__main__":
485
  demo.launch()
 
1
  import os
2
  import json
3
  import yaml
4
+ import gzip
5
+ import tarfile
6
+ import zipfile
7
+ import tempfile
8
  from pathlib import Path
9
 
10
  import gradio as gr
11
  from huggingface_hub import InferenceClient
12
  from huggingface_hub.errors import BadRequestError
13
 
14
+ TEXT_EXTENSIONS = {".tex", ".text", ".txt", ".bib", ".bbl", ".md"}
15
+
16
 
17
  def _normalize_message_content(content):
18
  if isinstance(content, list):
 
113
  return "\n".join(out).strip("- \n")
114
 
115
 
116
+ def _read_text_file(path: Path) -> str:
117
+ return path.read_text(encoding="utf-8", errors="ignore")
118
+
119
+
120
def _is_relevant_text_file(path: Path) -> bool:
    """Return True when *path* carries one of the recognised text extensions."""
    suffix = path.suffix.lower()
    return suffix in TEXT_EXTENSIONS
122
+
123
+
124
+ def _safe_extract_zip(zip_path: Path, output_dir: Path) -> None:
125
+ with zipfile.ZipFile(zip_path, "r") as zf:
126
+ for member in zf.infolist():
127
+ member_path = output_dir / member.filename
128
+ resolved_member = member_path.resolve()
129
+ resolved_root = output_dir.resolve()
130
+ if not str(resolved_member).startswith(str(resolved_root)):
131
+ continue
132
+ zf.extract(member, output_dir)
133
+
134
+
135
+ def _safe_extract_tar(tar_path: Path, output_dir: Path) -> None:
136
+ with tarfile.open(tar_path, "r:*") as tf:
137
+ for member in tf.getmembers():
138
+ member_path = output_dir / member.name
139
+ resolved_member = member_path.resolve()
140
+ resolved_root = output_dir.resolve()
141
+ if not str(resolved_member).startswith(str(resolved_root)):
142
+ continue
143
+ tf.extract(member, output_dir)
144
+
145
+
146
def _archive_to_tagged_source(extracted_root: Path) -> str:
    """Concatenate every relevant text file under *extracted_root*.

    Each file becomes a ``<file name="...">`` block; raises gr.Error when
    the archive contained no relevant text files at all.
    """
    tagged_parts = []
    for candidate in sorted(extracted_root.rglob("*")):
        if not (candidate.is_file() and _is_relevant_text_file(candidate)):
            continue
        try:
            rel_name = candidate.relative_to(extracted_root).as_posix()
            body = _read_text_file(candidate)
        except Exception:
            # Unreadable entries are skipped rather than failing the upload.
            continue
        tagged_parts.append(f'<file name="{rel_name}">\n{body}\n</file>\n')
    if not tagged_parts:
        raise gr.Error(
            "No relevant text files found in the archive. Expected .tex/.text/.txt/.bib/.bbl/.md files."
        )
    return "\n".join(tagged_parts)
162
+
163
+
164
def _convert_pdf_to_markdown(pdf_path: Path) -> str:
    """Convert a PDF to markdown text using the Marker SDK.

    Raises gr.Error when marker-pdf is not installed, when conversion
    fails, or when the converter produces no text at all.
    """
    try:
        from marker.converters.pdf import PdfConverter
        from marker.models import create_model_dict
        from marker.output import text_from_rendered
    except Exception as e:
        raise gr.Error(
            "Marker SDK is not available. Make sure `marker-pdf` is installed."
        ) from e

    try:
        converter = PdfConverter(artifact_dict=create_model_dict())
        rendered = converter(str(pdf_path))
        text, _, _ = text_from_rendered(rendered)
    except Exception as e:
        raise gr.Error(f"PDF conversion failed with Marker SDK: {e}") from e

    result = (text or "").strip()
    if not result:
        # Fall back to the renderer's own markdown attribute, if present.
        fallback = getattr(rendered, "markdown", "") if rendered is not None else ""
        result = (fallback or "").strip()
    if not result:
        raise gr.Error("Marker SDK produced empty output for this PDF.")
    return result
188
+
189
+
190
def _build_paper_source_from_upload(uploaded_path: str) -> str:
    """Turn an uploaded file into a textual "paper source" for the prompt.

    Supported inputs:
      * plain text files (TEXT_EXTENSIONS) -> returned verbatim
      * .zip / .tar / .tar.gz / .tgz archives -> relevant text files
        concatenated as <file name="..."> blocks
      * bare .gz / .gzip -> decompressed; payload must be a text file
        (or a tar archive, handled above)
      * .pdf -> converted to markdown via the Marker SDK

    Raises gr.Error for unsupported types or archives with no text files.
    """
    import shutil  # local import: only the gzip fallback path needs it

    src_path = Path(uploaded_path)
    file_name = src_path.name.lower()

    if _is_relevant_text_file(src_path):
        return _read_text_file(src_path)

    with tempfile.TemporaryDirectory(prefix="paper_extract_") as tmpdir:
        extract_root = Path(tmpdir) / "extracted"
        extract_root.mkdir(parents=True, exist_ok=True)

        if file_name.endswith(".zip"):
            _safe_extract_zip(src_path, extract_root)
            return _archive_to_tagged_source(extract_root)

        if file_name.endswith((".tar.gz", ".tgz", ".tar")):
            _safe_extract_tar(src_path, extract_root)
            return _archive_to_tagged_source(extract_root)

        if file_name.endswith((".gz", ".gzip")):
            # Handle compressed tar archives first.
            if tarfile.is_tarfile(src_path):
                _safe_extract_tar(src_path, extract_root)
                return _archive_to_tagged_source(extract_root)

            output_name = src_path.name
            if output_name.endswith(".gzip"):
                output_name = output_name[: -len(".gzip")]
            elif output_name.endswith(".gz"):
                output_name = output_name[: -len(".gz")]
            decompressed_path = extract_root / output_name

            # Stream instead of gz_in.read() so a large paper does not get
            # loaded into memory in one piece.
            with gzip.open(src_path, "rb") as gz_in, open(decompressed_path, "wb") as out_f:
                shutil.copyfileobj(gz_in, out_f)

            if _is_relevant_text_file(decompressed_path):
                return _read_text_file(decompressed_path)
            raise gr.Error(
                "Unsupported .gz/.gzip payload. It must contain a relevant text file or a tar archive."
            )

        if file_name.endswith(".pdf"):
            return _convert_pdf_to_markdown(src_path)

        raise gr.Error(
            "Unsupported file type. Use text files (.tex/.text/.txt/.bib/.bbl/.md), "
            "archives (.zip/.tar/.tar.gz/.tgz/.gz/.gzip), or .pdf."
        )
239
+
240
+
241
def get_all_marker_models():
    """Warm up the Marker SDK at app start.

    Builds the model dict once (result discarded) so the first PDF upload
    does not pay the cold-start cost; raises gr.Error if marker-pdf is
    missing or fails to initialise.
    """
    try:
        from marker.models import create_model_dict as _create_models
        _create_models()
    except Exception as exc:
        raise gr.Error(
            "Marker SDK is not available. Make sure `marker-pdf` is installed."
        ) from exc
249
+
250
+
251
  def run_single_interaction(
 
 
252
  message_input,
253
  history,
254
  ablation_mode,
 
271
  raise gr.Error("This run is complete. Click Restart to begin a new interaction.")
272
 
273
  prior_history = _sanitize_history(history)
 
 
 
 
 
 
274
 
275
  text = ""
276
  files = []
 
293
  raise gr.Error("Please sign in with Hugging Face before submitting.")
294
 
295
  file_label = None
296
+ file_path = None
297
  if has_file:
298
  file_item = files[0]
299
  file_path = file_item.get("path") if isinstance(file_item, dict) else file_item
300
  file_label = os.path.basename(file_path) if file_path else "uploaded_file"
301
 
302
+ paper_source = text if has_text else _build_paper_source_from_upload(file_path)
303
  user_prompt_template = prompts["user_prompt"]
304
  user_content = (
305
+ user_prompt_template.replace("{{paper_source}}", paper_source)
 
 
306
  .replace("{{num_ablations}}", str(num_ablations))
307
  )
308
+ user_display = f"Planning {num_ablations} ablations from submitted paper."
309
 
310
  client = InferenceClient(
311
  token=hf_token.token,
 
326
  )
327
  )
328
 
329
+ done_status = "Ablation plan complete. Click Restart to run another one."
330
  emitted = False
331
  raw_output = ""
332
  predictions_message_idx = None
 
417
  True,
418
  )
419
 
420
def print_like_dislike(x: gr.LikeData):
    """Log a chatbot like/dislike event (index, value, liked) to stdout."""
    feedback = (x.index, x.value, x.liked)
    print(*feedback)
422
+
423
 
424
  def change_ablation_mode(
425
  ablation_mode,
 
437
def restart_interaction():
    """Reset the chat UI: empty history, fresh status line, unlocked state."""
    fresh_history = []
    return (fresh_history, "Ready. Submit your paper.", False)
443
 
 
464
  }
465
  """
466
  ) as demo:
467
+ demo.load(get_all_marker_models)
468
  gr.Markdown(
469
  """
470
  # Ablation Bench
 
483
 
484
  status_text = gr.Markdown("Ready. Submit text or a single file.")
485
  restart_btn = gr.Button("↺")
486
+ chatbot = gr.Chatbot(
487
+ label="Ablation Plan",
488
+ buttons=[restart_btn, "copy"],
 
 
 
 
 
 
 
 
489
  )
490
+ interaction_locked = gr.State(False)
491
 
492
  message_input = gr.MultimodalTextbox(
493
  label="Paper content",
494
  placeholder="Paste your paper content here or upload a single PDF/MD/TEX file or a single zip/gzip file of your paper.",
495
+ lines=5,
496
  file_count="single",
497
+ file_types=[
498
+ "text",
499
+ ".tex",
500
+ ".text",
501
+ ".txt",
502
+ ".bib",
503
+ ".bbl",
504
+ ".md",
505
+ ".zip",
506
+ ".tar",
507
+ ".tar.gz",
508
+ ".tgz",
509
+ ".gz",
510
+ ".gzip",
511
+ ".pdf",
512
+ ],
513
  max_lines=1000,
514
  )
515
 
 
586
  message_input.submit(
587
  run_single_interaction,
588
  inputs=[
 
 
589
  message_input,
590
  chatbot,
591
  ablation_mode,
 
612
  ],
613
  )
614
 
615
+ chatbot.clear(
616
+ restart_interaction,
617
+ outputs=[
618
+ chatbot,
619
+ status_text,
620
+ interaction_locked,
621
+ ]
622
+ )
623
+
624
  ablation_mode.input(
625
  change_ablation_mode,
626
  inputs=[
 
630
  num_ablations,
631
  ]
632
  )
633
+ chatbot.like(print_like_dislike)
634
 
635
  if __name__ == "__main__":
636
  demo.launch()
prompts.yaml CHANGED
@@ -23,11 +23,7 @@ author_ablation:
23
  {"name": "Ablation B", "ablated_part": "description of the ablated part", "action": "REPLACE", "replacement": ["replacement1", "replacement2"], "metrics": ["metric3"]}
24
  </predictions>
25
  user_prompt: |-
26
- We're currently want to suggest ablation studies for the research titled {{paper_title}}. Here's the research abstract:
27
- ABSTRACT:
28
- {{problem_statement}}
29
-
30
- The paper source is provided below, after all of the instructions.
31
 
32
  INSTRUCTIONS:
33
  Now, you're going to suggest UP TO {{num_ablations}} ablation studies on your own, in a JSONL format.
@@ -85,11 +81,7 @@ reviewer_ablation:
85
  {"name": "Ablation B", "ablated_part": "description of the ablated part", "action": "REPLACE", "replacement": ["replacement1", "replacement2"], "metrics": ["metric3"]}
86
  </predictions>
87
  user_prompt: |-
88
- We're currently want to suggest missing ablation studies for the research titled {{paper_title}}. Here's the research abstract:
89
- ABSTRACT:
90
- {{problem_statement}}
91
-
92
- The paper source is provided below, after all of the instructions.
93
 
94
  INSTRUCTIONS:
95
  Now, you're going to suggest UP TO {{num_ablations}} missing ablation studies in the given paper on your own, in a JSONL format.
@@ -128,4 +120,4 @@ reviewer_ablation:
128
 
129
  <paper_source>
130
  {{paper_source}}
131
- </paper_source>
 
23
  {"name": "Ablation B", "ablated_part": "description of the ablated part", "action": "REPLACE", "replacement": ["replacement1", "replacement2"], "metrics": ["metric3"]}
24
  </predictions>
25
  user_prompt: |-
26
+ We currently want to suggest ablation studies for the following research paper source.
 
 
 
 
27
 
28
  INSTRUCTIONS:
29
  Now, you're going to suggest UP TO {{num_ablations}} ablation studies on your own, in a JSONL format.
 
81
  {"name": "Ablation B", "ablated_part": "description of the ablated part", "action": "REPLACE", "replacement": ["replacement1", "replacement2"], "metrics": ["metric3"]}
82
  </predictions>
83
  user_prompt: |-
84
+ We currently want to suggest missing ablation studies for the following research paper source.
 
 
 
 
85
 
86
  INSTRUCTIONS:
87
  Now, you're going to suggest UP TO {{num_ablations}} missing ablation studies in the given paper on your own, in a JSONL format.
 
120
 
121
  <paper_source>
122
  {{paper_source}}
123
+ </paper_source>
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- pytz
 
 
1
+ pytz
2
+ marker-pdf