Spaces:
Running
on
Zero
Running
on
Zero
Upload folder using huggingface_hub
Browse files- .gitattributes +7 -0
- README.md +10 -6
- app.py +167 -0
- assets/.DS_Store +0 -0
- assets/example/docstructbench_llm-raw-scihub-o.O-ijc.22994.pdf_3_5.png +3 -0
- assets/example/table_photo_chn_35.png +3 -0
- assets/example/table_photo_eng_23.png +3 -0
- assets/example/table_scan_chn_1.png +3 -0
- assets/example/table_scan_chn_37.png +3 -0
- assets/example/table_scan_eng_12.png +3 -0
- header.html +119 -0
- otsl_utils.py +413 -0
- requirements.txt +24 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/example/llm-raw-the-eye-o.O-1995_2418.pdf_1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/example/docstructbench_llm-raw-scihub-o.O-ijc.22994.pdf_3_5.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/example/table_photo_chn_35.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
assets/example/table_photo_eng_23.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
assets/example/table_scan_chn_1.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
assets/example/table_scan_chn_37.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
assets/example/table_scan_eng_12.png filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,12 +1,16 @@
|
|
| 1 |
---
|
| 2 |
-
title: TRivia
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
| 1 |
---
|
| 2 |
+
title: TRivia-3B
|
| 3 |
+
emoji: 🚀
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.9.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
short_description: Demo for TRivia
|
| 12 |
+
models:
|
| 13 |
+
- opendatalab/TRivia-3B
|
| 14 |
---
|
| 15 |
|
| 16 |
+
https://arxiv.org/abs/2512.01248
|
app.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["GRADIO_TEMP_DIR"] = "./tmp"
|
| 3 |
+
|
| 4 |
+
import time
|
| 5 |
+
import torch
|
| 6 |
+
import spaces
|
| 7 |
+
import tempfile
|
| 8 |
+
import sys
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from io import StringIO
|
| 11 |
+
from contextlib import contextmanager
|
| 12 |
+
from threading import Thread
|
| 13 |
+
from PIL import Image
|
| 14 |
+
from transformers import (
|
| 15 |
+
AutoProcessor,
|
| 16 |
+
AutoModelForCausalLM,
|
| 17 |
+
AutoModel,
|
| 18 |
+
AutoTokenizer,
|
| 19 |
+
Qwen2_5_VLForConditionalGeneration,
|
| 20 |
+
TextIteratorStreamer
|
| 21 |
+
)
|
| 22 |
+
from huggingface_hub import snapshot_download
|
| 23 |
+
from qwen_vl_utils import process_vision_info
|
| 24 |
+
from otsl_utils import convert_otsl_to_html
|
| 25 |
+
|
| 26 |
+
# == download weights ==
|
| 27 |
+
# model_dir = snapshot_download('opendatalab/TRivia-3B', local_dir='./models/TRivia-3B')
|
| 28 |
+
# == select device ==
|
| 29 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load TRivia-3B once at import time. On failure the globals are set to None
# so the UI can still start; the reason is logged instead of being discarded.
MODEL_ID = "opendatalab/TRivia-3B"
try:
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto"
    ).eval()
    print("✓ TRivia-3B loaded")
except Exception as e:
    import traceback

    print(f"✗ Failed to load {MODEL_ID}: {e}")
    traceback.print_exc()
    model = None
    processor = None
|
| 45 |
+
|
| 46 |
+
@spaces.GPU
def recognize_image(image: Image.Image,
                    max_new_tokens: int, temperature: float):
    """Stream table-recognition results for one uploaded image.

    This is a generator. Every yield is a 3-tuple
    ``(otsl_text, html_text, html_text)`` so that it always matches the three
    output components wired to the predict button; status and error messages
    are repeated in all three slots.

    Args:
        image: The image taken from the input component, or None when nothing
            was uploaded.
        max_new_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
        temperature: Forwarded to ``model.generate`` as ``temperature``.

    Yields:
        The accumulated OTSL text so far and its HTML conversion (twice).
    """
    if image is None:
        notice = "Please upload an image."
        # Three values: one per wired output component.
        yield notice, notice, notice
        return

    if model is None or processor is None:
        # Model loading failed at import time; report it clearly instead of
        # failing later with an AttributeError on None.
        notice = "Error during generation: model is not loaded."
        yield notice, notice, notice
        return

    try:
        # Prepare messages in chat format
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": "You are an AI specialized in recognizing and extracting table from images. Your mission is to analyze the table image and generate the result in OTSL format using specified tags. Output only the results without any other words and explanation."},
                {"type": "image"},
            ]
        }]

        prompt_full = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        inputs = processor(
            text=[prompt_full],
            images=[image],
            return_tensors="pt",
            padding=True
        ).to(device)

        streamer = TextIteratorStreamer(
            processor.tokenizer if hasattr(processor, 'tokenizer') else processor,
            skip_prompt=True,
            skip_special_tokens=True
        )

        generation_kwargs = {
            **inputs,
            "streamer": streamer,
            "max_new_tokens": max_new_tokens,
            "do_sample": True,
            "temperature": temperature,
            "repetition_penalty": 1.05,
        }

        worker_errors = []

        def run_generation():
            # Runs in the worker thread. If generate() fails, remember the
            # error and close the streamer so the consuming loop below does
            # not block forever waiting for text that will never arrive.
            try:
                model.generate(**generation_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                streamer.end()

        thread = Thread(target=run_generation)
        thread.start()

        # Stream the results
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            buffer = buffer.replace("<|im_end|>", "")
            html_text = convert_otsl_to_html(buffer)
            time.sleep(0.01)
            yield buffer, html_text, html_text

        # Ensure thread completes
        thread.join()

        # Surface a failure from the worker through the handler below.
        if worker_errors:
            raise worker_errors[0]

    except Exception as e:
        error_msg = f"Error during generation: {str(e)}"
        print(f"Full error: {e}")
        import traceback
        traceback.print_exc()
        yield error_msg, error_msg, error_msg
|
| 117 |
+
|
| 118 |
+
def gradio_reset():
    """Return four updates that each set a component's value to None."""
    cleared = [gr.update(value=None) for _ in range(4)]
    return tuple(cleared)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
if __name__ == "__main__":
    # Resolve bundled resources relative to this script (as the example
    # images already are) so the app also starts from another working directory.
    app_root = os.path.dirname(__file__)
    with open(os.path.join(app_root, "header.html"), "r", encoding="utf-8") as file:
        header = file.read()

    with gr.Blocks() as demo:
        gr.HTML(header)

        with gr.Row():
            with gr.Column():

                input_img = gr.Image(label=" ", interactive=True)
                with gr.Row():
                    clear = gr.Button(value="Clear")
                    predict = gr.Button(value="Table Recognition", interactive=True, variant="primary")

                with gr.Accordion("Advanced Settings", open=False):
                    max_tokens = gr.Slider(
                        minimum=1,
                        maximum=8192,
                        value=4096,
                        step=1,
                        label="Max New Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.1,
                        step=0.1,
                        label="Temperature"
                    )

                with gr.Accordion("Examples:"):
                    example_root = os.path.join(app_root, "assets", "example")
                    # os.listdir() order is arbitrary; sort for a stable example list.
                    gr.Examples(
                        examples=sorted(
                            os.path.join(example_root, name)
                            for name in os.listdir(example_root)
                            if name.endswith("png")
                        ),
                        inputs=[input_img],
                    )
            with gr.Column():
                rendered_html = gr.Markdown(label="Rendered HTML:", show_label=True)
                output_html = gr.Textbox(label="Converted HTML:", interactive=False)
                pred_otsl = gr.Textbox(label="Predicted OTSL:", interactive=False)

        clear.click(gradio_reset, inputs=None, outputs=[input_img, pred_otsl, output_html, rendered_html])
        predict.click(recognize_image, inputs=[input_img, max_tokens, temperature], outputs=[pred_otsl, output_html, rendered_html])

    demo.launch(server_name="0.0.0.0", server_port=10041, debug=True)
|
assets/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
assets/example/docstructbench_llm-raw-scihub-o.O-ijc.22994.pdf_3_5.png
ADDED
|
Git LFS Details
|
assets/example/table_photo_chn_35.png
ADDED
|
Git LFS Details
|
assets/example/table_photo_eng_23.png
ADDED
|
Git LFS Details
|
assets/example/table_scan_chn_1.png
ADDED
|
Git LFS Details
|
assets/example/table_scan_chn_37.png
ADDED
|
Git LFS Details
|
assets/example/table_scan_eng_12.png
ADDED
|
Git LFS Details
|
header.html
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<html><head>
|
| 2 |
+
<!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css"> -->
|
| 3 |
+
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
|
| 4 |
+
<style>
|
| 5 |
+
.link-block {
|
| 6 |
+
border: 1px solid transparent;
|
| 7 |
+
border-radius: 24px;
|
| 8 |
+
background-color: rgba(54, 54, 54, 1);
|
| 9 |
+
cursor: pointer !important;
|
| 10 |
+
}
|
| 11 |
+
.link-block:hover {
|
| 12 |
+
background-color: rgba(54, 54, 54, 0.75) !important;
|
| 13 |
+
cursor: pointer !important;
|
| 14 |
+
}
|
| 15 |
+
.external-link {
|
| 16 |
+
display: inline-flex;
|
| 17 |
+
align-items: center;
|
| 18 |
+
height: 36px;
|
| 19 |
+
line-height: 36px;
|
| 20 |
+
padding: 0 16px;
|
| 21 |
+
cursor: pointer !important;
|
| 22 |
+
}
|
| 23 |
+
.external-link,
|
| 24 |
+
.external-link:hover {
|
| 25 |
+
cursor: pointer !important;
|
| 26 |
+
}
|
| 27 |
+
a {
|
| 28 |
+
text-decoration: none;
|
| 29 |
+
}
|
| 30 |
+
</style></head>
|
| 31 |
+
|
| 32 |
+
<body>
|
| 33 |
+
<div style="
|
| 34 |
+
display: flex;
|
| 35 |
+
flex-direction: column;
|
| 36 |
+
justify-content: center;
|
| 37 |
+
align-items: center;
|
| 38 |
+
text-align: center;
|
| 39 |
+
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
|
| 40 |
+
padding: 24px;
|
| 41 |
+
gap: 24px;
|
| 42 |
+
border-radius: 8px;
|
| 43 |
+
">
|
| 44 |
+
<div style="
|
| 45 |
+
display: flex;
|
| 46 |
+
flex-direction: column;
|
| 47 |
+
align-items: center;
|
| 48 |
+
gap: 16px;
|
| 49 |
+
">
|
| 50 |
+
<div style="display: flex; flex-direction: column; gap: 8px">
|
| 51 |
+
<h1 style="
|
| 52 |
+
font-size: 48px;
|
| 53 |
+
color: #fafafa;
|
| 54 |
+
margin: 0;
|
| 55 |
+
font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
|
| 56 |
+
'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
|
| 57 |
+
">
|
| 58 |
+
TRivia-3B: Demo
|
| 59 |
+
</h1>
|
| 60 |
+
</div>
|
| 61 |
+
</div>
|
| 62 |
+
|
| 63 |
+
<p style="
|
| 64 |
+
margin: 0;
|
| 65 |
+
line-height: 1.6rem;
|
| 66 |
+
font-size: 16px;
|
| 67 |
+
color: #fafafa;
|
| 68 |
+
opacity: 0.8;
|
| 69 |
+
">
|
| 70 |
+
Self-supervised Fine-tuning of Vision-Language Models for Table Recognition.<br>
|
| 71 |
+
</p>
|
| 72 |
+
<style>
|
| 73 |
+
.link-block {
|
| 74 |
+
display: inline-block;
|
| 75 |
+
}
|
| 76 |
+
.link-block + .link-block {
|
| 77 |
+
margin-left: 20px;
|
| 78 |
+
}
|
| 79 |
+
</style>
|
| 80 |
+
|
| 81 |
+
<div class="column has-text-centered">
|
| 82 |
+
<div class="publication-links">
|
| 83 |
+
<!-- Code Link. -->
|
| 84 |
+
<span class="link-block">
|
| 85 |
+
<a href="https://github.com/opendatalab/TRivia" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 86 |
+
<span class="icon" style="margin-right: 4px">
|
| 87 |
+
<i class="fab fa-github" style="color: white; margin-right: 4px"></i>
|
| 88 |
+
</span>
|
| 89 |
+
<span style="color: white">Code</span>
|
| 90 |
+
</a>
|
| 91 |
+
</span>
|
| 92 |
+
|
| 93 |
+
<!-- Model Link. -->
|
| 94 |
+
<span class="link-block">
|
| 95 |
+
<a href="https://huggingface.co/opendatalab/TRivia-3B" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 96 |
+
<span class="icon" style="margin-right: 4px">
|
| 97 |
+
<i class="fas fa-archive" style="color: white; margin-right: 4px"></i>
|
| 98 |
+
</span>
|
| 99 |
+
<span style="color: white">Model</span>
|
| 100 |
+
</a>
|
| 101 |
+
</span>
|
| 102 |
+
|
| 103 |
+
<!-- Paper Link. -->
|
| 104 |
+
<span class="link-block">
|
| 105 |
+
<a href="https://arxiv.org/abs/2512.01248" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 106 |
+
<span class="icon" style="margin-right: 8px">
|
| 107 |
+
<i class="fas fa-file" style="color: white"></i>
|
| 108 |
+
</span>
|
| 109 |
+
<span style="color: white">Paper</span>
|
| 110 |
+
</a>
|
| 111 |
+
</span>
|
| 112 |
+
</div>
|
| 113 |
+
</div>
|
| 114 |
+
|
| 115 |
+
<!-- New Demo Links -->
|
| 116 |
+
</div>
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
</body></html>
|
otsl_utils.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import itertools
|
| 3 |
+
import html
|
| 4 |
+
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
| 5 |
+
from pydantic import (
|
| 6 |
+
AnyUrl,
|
| 7 |
+
BaseModel,
|
| 8 |
+
ConfigDict,
|
| 9 |
+
Field,
|
| 10 |
+
StringConstraints,
|
| 11 |
+
computed_field,
|
| 12 |
+
field_validator,
|
| 13 |
+
model_validator,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
class TableCell(BaseModel):
    """One table cell: its text, its spans and the grid region it covers."""

    row_span: int = 1
    col_span: int = 1
    # Offsets are used as half-open ranges [start, end) when the grid is filled.
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Derive ``text`` for dict input that does not carry it.

        Input that is not a dict, or that already has a ``"text"`` key, is
        returned unchanged. Otherwise the text is taken from
        ``data["bbox"]["token"]``; when that is empty it is built by joining
        the ``"token"`` of every entry in ``"text_cell_bboxes"`` with spaces.

        The caller's dict is never modified: a shallow copy is filled in and
        returned. A missing ``"bbox"`` is treated as an empty token rather
        than raising ``KeyError``.
        """
        if not isinstance(data, dict) or "text" in data:
            return data

        # Work on a copy so validation has no side effects on the input.
        data = dict(data)
        bbox = data.get("bbox") or {}
        text = bbox.get("token", "") or ""
        if not text:
            text_cells = data.pop("text_cell_bboxes", None)
            if text_cells:
                text = " ".join(el["token"] for el in text_cells)

        data["text"] = text.strip()
        return data
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class TableData(BaseModel):  # TBD
    """A table as a flat list of cells plus its row and column counts."""

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field  # type: ignore
    @property
    def grid(self) -> List[List[TableCell]]:
        """Return a ``num_rows`` x ``num_cols`` matrix of cells.

        Every position starts as an empty 1x1 placeholder cell. Each cell in
        ``table_cells`` is then written into every position of its
        ``[start, end)`` row and column range, clamped to the table size, so
        a spanning cell appears at all positions it covers.
        """
        n_rows, n_cols = self.num_rows, self.num_cols

        # Placeholder matrix: one empty cell per grid position.
        matrix = []
        for r in range(n_rows):
            matrix.append(
                [
                    TableCell(
                        text="",
                        start_row_offset_idx=r,
                        end_row_offset_idx=r + 1,
                        start_col_offset_idx=c,
                        end_col_offset_idx=c + 1,
                    )
                    for c in range(n_cols)
                ]
            )

        # Stamp real cells over the placeholders they cover.
        for cell in self.table_cells:
            row_lo = min(cell.start_row_offset_idx, n_rows)
            row_hi = min(cell.end_row_offset_idx, n_rows)
            col_lo = min(cell.start_col_offset_idx, n_cols)
            col_hi = min(cell.end_col_offset_idx, n_cols)
            for r in range(row_lo, row_hi):
                for c in range(col_lo, col_hi):
                    matrix[r][c] = cell

        return matrix
|
| 97 |
+
|
| 98 |
+
# OTSL token vocabulary.
OTSL_NL = "<nl>"
OTSL_FCEL = "<fcel>"
OTSL_ECEL = "<ecel>"
OTSL_LCEL = "<lcel>"
OTSL_UCEL = "<ucel>"
OTSL_XCEL = "<xcel>"


def otsl_extract_tokens_and_text(s: str) -> Tuple[List[str], List[str]]:
    """Split an OTSL string into its tokens and its token/text sequence.

    Only the six OTSL tokens defined above are recognised; any other
    angle-bracketed text is treated as ordinary cell text.

    Args:
        s: Raw OTSL string.

    Returns:
        ``(tokens, text_parts)``. ``tokens`` lists the OTSL tokens in order
        of appearance. ``text_parts`` lists tokens and the text between them
        in document order, with empty and whitespace-only pieces removed;
        the remaining text pieces are not stripped.
    """
    vocabulary = (OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL)
    # The capturing group makes re.split() keep the tokens in its output.
    pattern = "(" + "|".join(re.escape(token) for token in vocabulary) + ")"

    tokens = re.findall(pattern, s)
    text_parts = [part for part in re.split(pattern, s) if part.strip()]

    return tokens, text_parts
|
| 124 |
+
|
| 125 |
+
def otsl_parse_texts(texts, tokens):
    """Build table cells from an OTSL token stream and its interleaved text.

    Args:
        texts: OTSL tokens and cell text in document order (the second value
            returned by ``otsl_extract_tokens_and_text``).
        tokens: The OTSL tokens alone, in the same order.

    Returns:
        ``(table_cells, split_row_tokens)``. ``split_row_tokens`` holds one
        list of cell tokens per row, with short rows padded with ``<ecel>``
        so the grid is rectangular. ``table_cells`` holds one ``TableCell``
        for every ``<fcel>`` / ``<ecel>`` in row-major order, including the
        padding cells.

    Text is paired with the token it directly follows. Text that does not
    directly follow a cell token (before the first token, or right after
    ``<nl>``) is ignored instead of shifting every later cell's text.
    """
    cell_tokens = (OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL)
    horizontal_continuations = (OTSL_LCEL, OTSL_XCEL)
    vertical_continuations = (OTSL_UCEL, OTSL_XCEL)

    # One list of cell tokens per row; runs of <nl> only act as separators.
    split_row_tokens = [
        list(group)
        for is_newline, group in itertools.groupby(tokens, lambda tok: tok == OTSL_NL)
        if not is_newline
    ]
    if not split_row_tokens:
        return [], split_row_tokens

    # Text belonging to each cell token, in token order ("" when none).
    cell_texts = []
    follows_cell_token = False
    for part in texts:
        if part in cell_tokens:
            cell_texts.append("")
            follows_cell_token = True
        elif part == OTSL_NL:
            follows_cell_token = False
        else:
            if follows_cell_token:
                cell_texts[-1] = part
            follows_cell_token = False

    # Distribute the texts over the rows before padding, so padding cells
    # never consume text that belongs to a real cell.
    remaining_texts = iter(cell_texts)
    row_texts = [
        [next(remaining_texts, "") for _ in row] for row in split_row_tokens
    ]

    # Pad every row to the widest row so the grid is complete.
    num_cols = max(len(row) for row in split_row_tokens)
    for row, texts_in_row in zip(split_row_tokens, row_texts):
        missing = num_cols - len(row)
        row.extend([OTSL_ECEL] * missing)
        texts_in_row.extend([""] * missing)

    # Column view of the (now rectangular) grid for vertical span counting.
    columns = list(zip(*split_row_tokens))

    def run_length(sequence, start, members):
        """Count consecutive items of ``sequence`` from ``start`` that are in ``members``."""
        end = start
        while end < len(sequence) and sequence[end] in members:
            end += 1
        return end - start

    table_cells = []
    for r_idx, row in enumerate(split_row_tokens):
        for c_idx, token in enumerate(row):
            # Only <fcel>/<ecel> start a cell; the other tokens mark grid
            # positions covered by a neighbouring cell's span.
            if token not in (OTSL_FCEL, OTSL_ECEL):
                continue

            cell_text = row_texts[r_idx][c_idx] if token == OTSL_FCEL else ""
            col_span = 1 + run_length(row, c_idx + 1, horizontal_continuations)
            row_span = 1 + run_length(columns[c_idx], r_idx + 1, vertical_continuations)

            table_cells.append(
                TableCell(
                    text=cell_text.strip(),
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=r_idx,
                    end_row_offset_idx=r_idx + row_span,
                    start_col_offset_idx=c_idx,
                    end_col_offset_idx=c_idx + col_span,
                )
            )

    return table_cells, split_row_tokens
|
| 259 |
+
|
| 260 |
+
# def export_to_html(table_data: TableData):
|
| 261 |
+
# nrows = table_data.num_rows
|
| 262 |
+
# ncols = table_data.num_cols
|
| 263 |
+
# if len(table_data.table_cells) == 0:
|
| 264 |
+
# return ""
|
| 265 |
+
|
| 266 |
+
# body = ""
|
| 267 |
+
# grid = table_data.grid
|
| 268 |
+
|
| 269 |
+
# for i in range(nrows):
|
| 270 |
+
# body += "<tr>"
|
| 271 |
+
# for j in range(ncols):
|
| 272 |
+
# cell: TableCell = grid[i][j]
|
| 273 |
+
|
| 274 |
+
# rowspan, rowstart = (
|
| 275 |
+
# cell.row_span,
|
| 276 |
+
# cell.start_row_offset_idx,
|
| 277 |
+
# )
|
| 278 |
+
# colspan, colstart = (
|
| 279 |
+
# cell.col_span,
|
| 280 |
+
# cell.start_col_offset_idx,
|
| 281 |
+
# )
|
| 282 |
+
|
| 283 |
+
# if rowstart != i:
|
| 284 |
+
# continue
|
| 285 |
+
# if colstart != j:
|
| 286 |
+
# continue
|
| 287 |
+
|
| 288 |
+
# content = html.escape(cell.text.strip())
|
| 289 |
+
# celltag = "td"
|
| 290 |
+
# if cell.column_header:
|
| 291 |
+
# celltag = "th"
|
| 292 |
+
|
| 293 |
+
# opening_tag = f"{celltag}"
|
| 294 |
+
# if rowspan > 1:
|
| 295 |
+
# opening_tag += f' rowspan="{rowspan}"'
|
| 296 |
+
# if colspan > 1:
|
| 297 |
+
# opening_tag += f' colspan="{colspan}"'
|
| 298 |
+
|
| 299 |
+
# body += f"<{opening_tag}>{content}</{celltag}>"
|
| 300 |
+
# body += "</tr>"
|
| 301 |
+
|
| 302 |
+
# # dir = get_text_direction(text)
|
| 303 |
+
# body = f"<table>{body}</table>"
|
| 304 |
+
|
| 305 |
+
# return body
|
| 306 |
+
|
| 307 |
+
def export_to_html(table_data: TableData) -> str:
    """Serialise ``table_data`` as one ``<table>...</table>`` string.

    Returns an empty string when the table has no cells. A cell is emitted
    only at the grid position where it starts, so a spanning cell appears
    once, with ``rowspan`` / ``colspan`` attributes when the span exceeds 1.
    Column-header cells use ``<th>``, all others ``<td>``.

    NOTE: cell text is inserted verbatim; it is not HTML-escaped.
    """
    if not table_data.table_cells:
        return ""

    grid = table_data.grid
    pieces = ["<table>"]

    for r in range(table_data.num_rows):
        pieces.append("<tr>")
        for c in range(table_data.num_cols):
            cell: TableCell = grid[r][c]

            # Skip positions that are merely covered by a span.
            starts_here = (
                cell.start_row_offset_idx == r and cell.start_col_offset_idx == c
            )
            if not starts_here:
                continue

            tag = "th" if cell.column_header else "td"
            attributes = ""
            if cell.row_span > 1:
                attributes += f' rowspan="{cell.row_span}"'
            if cell.col_span > 1:
                attributes += f' colspan="{cell.col_span}"'

            pieces.append(f"<{tag}{attributes}>{cell.text.strip()}</{tag}>")
        pieces.append("</tr>")

    pieces.append("</table>")
    return "".join(pieces)
|
| 344 |
+
|
| 345 |
+
def convert_otsl_to_html(otsl_content: str) -> str:
    """Convert an OTSL string into an HTML table string.

    Tokenises the input, parses it into cells and a per-row token grid, and
    renders a ``TableData`` whose row count is the number of token rows and
    whose column count is the widest row (0 when there are no rows).
    """
    tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
    cells, row_tokens = otsl_parse_texts(mixed_texts, tokens)

    widest_row = max((len(row) for row in row_tokens), default=0)
    table = TableData(
        num_rows=len(row_tokens),
        num_cols=widest_row,
        table_cells=cells,
    )
    return export_to_html(table)
|
| 364 |
+
|
| 365 |
+
if __name__ == "__main__":
|
| 366 |
+
import time
|
| 367 |
+
|
| 368 |
+
# test
|
| 369 |
+
a = """
|
| 370 |
+
<fcel><nl>\n
|
| 371 |
+
<fcel><nl>\n"""
|
| 372 |
+
b = """<fcel>Reviewer<fcel>Representation<fcel>Consultant<fcel>Speaker's Bureau<fcel>Ownership/ Partnership/ Principal<fcel>Personal Research<fcel>Institutional, Organizational, or Other Financial Benefit<fcel>Expert Witness<nl>
|
| 373 |
+
<fcel>John E. Brush<fcel>Official Reviewer–ACCF Board of Trustees<fcel>● United Healthcare<fcel>None<fcel>None<fcel>None<fcel>● PROMETHEUS Payment (Board member)<fcel>None<nl>
|
| 374 |
+
<fcel>David P. Faxon<fcel>Official Reviewer–AHA<fcel>● Johnson & Johnson<fcel>None<fcel>● CULPRIT Trial (PI)*<fcel>None<fcel>● Circulation: Cardiovascular Interventions—Editor*<fcel>None<nl>
|
| 375 |
+
<ucel><ucel><ucel><ucel><fcel>● RIVA Medical<ucel><ucel><ucel><nl>
|
| 376 |
+
<fcel>Robert A. Harrington<fcel>Official Reviewer–AHA<fcel>● AstraZeneca*<fcel>None<fcel>None<fcel>● AstraZeneca<fcel>None<fcel>None<nl>
|
| 377 |
+
<ucel><ucel><fcel>● Baxter<ucel><ucel><fcel>● Baxter<ucel><ucel><nl>
|
| 378 |
+
<ucel><ucel><fcel>● CSL Behring<ucel><ucel><fcel>● Bristol-Myers Squibb*<ucel><ucel><nl>
|
| 379 |
+
<ucel><ucel><fcel>● Eli Lilly<ucel><ucel><fcel>● GlaxoSmithKline<ucel><ucel><nl>
|
| 380 |
+
<ucel><ucel><fcel>● Luiypold<ucel><ucel><fcel>● The Medicines Company<ucel><ucel><nl>
|
| 381 |
+
<ucel><ucel><fcel>● Merck<ucel><ucel><fcel>● Merck*<ucel><ucel><nl>
|
| 382 |
+
<ucel><ucel><fcel>● Novartis<ucel><ucel><fcel>● Portola*<ucel><ucel><nl>
|
| 383 |
+
<ucel><ucel><fcel>● Otsuka Maryland Research Institute<ucel><ucel><fcel>● Schering-Plough*<ucel><ucel><nl>
|
| 384 |
+
<ucel><ucel><fcel>● Regado<ucel><ucel><ucel><ucel><ucel><nl>
|
| 385 |
+
<ucel><ucel><fcel>● Sanofi-aventis<ucel><ucel><ucel><ucel><ucel><nl>
|
| 386 |
+
<ucel><ucel><fcel>● Schering-Plough*<ucel><ucel><ucel><ucel><ucel><nl>
|
| 387 |
+
<ucel><ucel><fcel>● WebMD*<ucel><ucel><ucel><ucel><ucel><nl>
|
| 388 |
+
<fcel>Judith S. Hochman<fcel>Official Reviewer–ACCF/AHA Task Force on Practice Guidelines<fcel>● BMS/Sanofi<fcel>None<fcel>None<fcel>● Johnson & Johnson/Bayer Healthcare AG (DSMB)<fcel>None<fcel>None<nl>
|
| 389 |
+
<ucel><ucel><fcel>● Eli Lilly<ucel><ucel><ucel><ucel><ucel><nl>
|
| 390 |
+
<ucel><ucel><fcel>● GlaxoSmithKline<ucel><ucel><ucel><ucel><ucel><nl>
|
| 391 |
+
<ucel><ucel><fcel>● Millennium Pharmaceuticals/ Schering-Plough<ucel><ucel><fcel>● Schering-Plough (TIMI 50) (DSMB)<ucel><ucel><nl>
|
| 392 |
+
<fcel>Rodney H. Zimmermann<fcel>Official Reviewer–ACCF Board of Governors<fcel>● AstraZeneca<fcel>● AstraZeneca<fcel>None<fcel>● AstraZeneca<fcel>None<fcel>None<nl>
|
| 393 |
+
<ucel><ucel><fcel>● Boehringer Ingelheim<fcel>● Merck-Frost<fcel>● Sanofi-aventis<ucel><fcel>● Sanofi-aventis<ucel><nl>
|
| 394 |
+
<ucel><ucel><fcel>● Bristol-Myers Squibb<fcel>● Servier<ucel><ucel><ucel><ucel><nl>
|
| 395 |
+
<ucel><ucel><fcel>● Medtronic<ucel><ucel><ucel><ucel><ucel><nl>
|
| 396 |
+
<ucel><ucel><fcel>● Sanofi-aventis<ucel><ucel><ucel><ucel><ucel><nl>
|
| 397 |
+
<ucel><ucel><fcel>● Schering-Plough<ucel><ucel><ucel><ucel><ucel><nl>
|
| 398 |
+
<fcel>Steven Brown<fcel>Organizational Reviewer–AAFP<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
|
| 399 |
+
<fcel>Joseph C. Cleveland<fcel>Organizational Reviewer–STS<fcel>● Baxter Biosurgery<fcel>None<fcel>None<fcel>None<fcel>● Heartware<fcel>None<nl>
|
| 400 |
+
<ucel><ucel><fcel>● Essential Pharmaceuticals<ucel><ucel><ucel><fcel>● Thoratec<ucel><nl>
|
| 401 |
+
<fcel>Wyatt Decker<fcel>Organizational Reviewer–ACEP<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
|
| 402 |
+
<fcel>Joseph A. de Gregorio<fcel>Organizational Reviewer–SCAI<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
|
| 403 |
+
<fcel>Deborah B. Diercks<fcel>Organizational Reviewer–ACEP<fcel>● AstraZeneca<fcel>None<fcel>None<fcel>None<fcel>● Society of Chest Pain Centers and Providers<fcel>None<nl>
|
| 404 |
+
<ucel><ucel><fcel>● Sanofi-aventis<ucel><ucel><ucel><ucel><ucel><nl>
|
| 405 |
+
<ucel><ucel><fcel>● Schering-Plough<ucel><ucel><ucel><ucel><ucel><nl>
|
| 406 |
+
<fcel>Benjamin Hatten<fcel>Organizational Reviewer–ACEP<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
|
| 407 |
+
<fcel>Loren F. Hiratzka<fcel>Organizational Reviewer–STS<fcel>None<fcel>None<fcel>None<fcel>None<fcel>● Cardiac, Vascular, and Thoracic Surgeons*<fcel>None<nl>
|
| 408 |
+
<ucel><ucel><ucel><ucel><ucel><ucel><fcel>● TriHealth (Bethesda North and Good Samaritan Hospitals)*<ucel><nl>
|
| 409 |
+
<fcel>Jason H. Rogers<fcel>Organizational Reviewer–SCAI<fcel>● Ample Medical<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
|
| 410 |
+
<fcel>Vincenza T. Show<fcel>Organizational Reviewer–ACP<fcel>None<fcel>None<fcel>None<fcel>● Boehringer Ingelheim*<fcel>● ACP*<fcel>None<nl>
|
| 411 |
+
<ucel><ucel><ucel><ucel><ucel><fcel>● Bristol-Myers Squibb*<ucel><ucel><nl>
|
| 412 |
+
"""
|
| 413 |
+
print(convert_otsl_to_html(b))
|
requirements.txt
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Flash Attention - CUDA 12, PyTorch 2.6, Python 3.10
|
| 2 |
+
flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
| 3 |
+
|
| 4 |
+
# Core ML/AI Libraries
|
| 5 |
+
torch==2.6.0
|
| 6 |
+
torchvision
|
| 7 |
+
accelerate>=0.24.0
|
| 8 |
+
|
| 9 |
+
# Transformers - using version compatible with both sets of models
|
| 10 |
+
transformers==4.57.1
|
| 11 |
+
tokenizers>=0.20.3
|
| 12 |
+
transformers-stream-generator
|
| 13 |
+
|
| 14 |
+
# Hugging Face
|
| 15 |
+
huggingface_hub
|
| 16 |
+
hf_xet
|
| 17 |
+
spaces>=0.20.0
|
| 18 |
+
|
| 19 |
+
# Vision & Image Processing
|
| 20 |
+
qwen-vl-utils
|
| 21 |
+
|
| 22 |
+
# Web Interface
|
| 23 |
+
gradio==5.9.1
|
| 24 |
+
pydantic==2.10.6
|