Carkham commited on
Commit
0373e86
·
verified ·
1 Parent(s): 48d7965

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/example/llm-raw-the-eye-o.O-1995_2418.pdf_1.jpg filter=lfs diff=lfs merge=lfs -text
37
+ assets/example/docstructbench_llm-raw-scihub-o.O-ijc.22994.pdf_3_5.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/example/table_photo_chn_35.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/example/table_photo_eng_23.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/example/table_scan_chn_1.png filter=lfs diff=lfs merge=lfs -text
41
+ assets/example/table_scan_chn_37.png filter=lfs diff=lfs merge=lfs -text
42
+ assets/example/table_scan_eng_12.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,16 @@
1
  ---
2
- title: TRivia 3B
3
- emoji: 🔥
4
- colorFrom: indigo
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 6.0.2
8
  app_file: app.py
9
  pinned: false
 
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: TRivia-3B
3
+ emoji: 🚀
4
+ colorFrom: purple
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.9.1
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
+ short_description: Demo for TRivia
12
+ models:
13
+ - opendatalab/TRivia-3B
14
  ---
15
 
16
+ https://arxiv.org/abs/2512.01248
app.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["GRADIO_TEMP_DIR"] = "./tmp"
3
+
4
+ import time
5
+ import torch
6
+ import spaces
7
+ import tempfile
8
+ import sys
9
+ import gradio as gr
10
+ from io import StringIO
11
+ from contextlib import contextmanager
12
+ from threading import Thread
13
+ from PIL import Image
14
+ from transformers import (
15
+ AutoProcessor,
16
+ AutoModelForCausalLM,
17
+ AutoModel,
18
+ AutoTokenizer,
19
+ Qwen2_5_VLForConditionalGeneration,
20
+ TextIteratorStreamer
21
+ )
22
+ from huggingface_hub import snapshot_download
23
+ from qwen_vl_utils import process_vision_info
24
+ from otsl_utils import convert_otsl_to_html
25
+
# == download weights ==
# model_dir = snapshot_download('opendatalab/TRivia-3B', local_dir='./models/TRivia-3B')

# == select device ==
# Inputs are moved to this device before generation; the model itself is
# placed by `device_map="auto"` below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load TRivia-3B (a Qwen2.5-VL model fine-tuned for table recognition).
# On failure the app keeps running in a degraded state (model=None).
try:
    MODEL_ID = "opendatalab/TRivia-3B"
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto"
    ).eval()
    print("✓ TRivia-3B loaded")
except Exception as e:
    # Log the failure so it is visible in the Space logs — the original
    # swallowed the exception silently, leaving no trace of why the
    # model is unavailable.
    print(f"✗ Failed to load TRivia-3B: {e}")
    model = None
    processor = None
@spaces.GPU
def recognize_image(image: Image.Image,
                    max_new_tokens: int, temperature: float):
    """Run table recognition on an image, streaming partial results.

    Args:
        image: Input table image (PIL), or None if nothing was uploaded.
        max_new_tokens: Generation budget for the model.
        temperature: Sampling temperature.

    Yields:
        3-tuples ``(otsl_text, html_text, html_text)`` — one value per
        wired Gradio output component [pred_otsl, output_html, rendered_html].
    """
    if image is None:
        # BUGFIX: this handler feeds THREE output components, so every
        # yield must produce three values (the original yielded only two
        # here, which makes Gradio raise a "not enough output values" error).
        msg = "Please upload an image."
        yield msg, msg, msg
        return

    try:
        # Prepare messages in chat format; the image placeholder is bound
        # to the actual image by the processor call below.
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": "You are an AI specialized in recognizing and extracting table from images. Your mission is to analyze the table image and generate the result in OTSL format using specified tags. Output only the results without any other words and explanation."},
                {"type": "image"},
            ]
        }]

        prompt_full = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        inputs = processor(
            text=[prompt_full],
            images=[image],
            return_tensors="pt",
            padding=True
        ).to(device)

        # Stream decoded tokens as they are generated so the UI updates live.
        streamer = TextIteratorStreamer(
            processor.tokenizer if hasattr(processor, 'tokenizer') else processor,
            skip_prompt=True,
            skip_special_tokens=True
        )

        generation_kwargs = {
            **inputs,
            "streamer": streamer,
            "max_new_tokens": max_new_tokens,
            "do_sample": True,
            "temperature": temperature,
            "repetition_penalty": 1.05,
        }

        # model.generate blocks until done, so run it on a worker thread and
        # consume the streamer on this one.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        # Stream the results, re-rendering the (partial) OTSL to HTML on
        # every new chunk.
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            # Defensive: strip the chat end-of-turn marker if it leaks through.
            buffer = buffer.replace("<|im_end|>", "")
            html_text = convert_otsl_to_html(buffer)
            time.sleep(0.01)  # small pause keeps UI updates smooth
            yield buffer, html_text, html_text

        # Ensure the generation thread completes before returning.
        thread.join()

    except Exception as e:
        error_msg = f"Error during generation: {str(e)}"
        print(f"Full error: {e}")
        import traceback
        traceback.print_exc()
        yield error_msg, error_msg, error_msg
def gradio_reset():
    """Clear the image input and all three result panes."""
    return tuple(gr.update(value=None) for _ in range(4))
120
+
121
+
if __name__ == "__main__":
    # Static page header (title, paper/code/model links) shown above the demo.
    with open("header.html", "r") as file:
        header = file.read()
    with gr.Blocks() as demo:
        gr.HTML(header)

        with gr.Row():
            # Left column: image input, action buttons, settings, examples.
            with gr.Column():

                input_img = gr.Image(label=" ", interactive=True)
                with gr.Row():
                    clear = gr.Button(value="Clear")
                    predict = gr.Button(value="Table Recognition", interactive=True, variant="primary")

                with gr.Accordion("Advanced Settings", open=False):
                    max_tokens = gr.Slider(
                        minimum=1,
                        maximum=8192,
                        value=4096,
                        step=1,
                        label="Max New Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.1,
                        step=0.1,
                        label="Temperature"
                    )

                with gr.Accordion("Examples:"):
                    # Bundled sample table images (PNG files only; the .jpg
                    # example shipped in assets is deliberately(?) excluded —
                    # NOTE(review): confirm whether .jpg should be listed too.
                    example_root = os.path.join(os.path.dirname(__file__), "assets", "example")
                    gr.Examples(
                        examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
                                  _.endswith("png")],
                        inputs=[input_img],
                    )
            # Right column: streamed outputs (rendered table, HTML, raw OTSL).
            with gr.Column():
                rendered_html = gr.Markdown(label="Rendered HTML:", show_label=True)
                output_html = gr.Textbox(label="Converted HTML:", interactive=False)
                pred_otsl = gr.Textbox(label="Predicted OTSL:", interactive=False)

        # Wiring: clear resets input + all panes; predict streams
        # (otsl, html, html) triples into the three output components.
        clear.click(gradio_reset, inputs=None, outputs=[input_img, pred_otsl, output_html, rendered_html])
        predict.click(recognize_image, inputs=[input_img, max_tokens, temperature], outputs=[pred_otsl, output_html, rendered_html])

    demo.launch(server_name="0.0.0.0", server_port=10041, debug=True)
assets/.DS_Store ADDED
Binary file (6.15 kB). View file
 
assets/example/docstructbench_llm-raw-scihub-o.O-ijc.22994.pdf_3_5.png ADDED

Git LFS Details

  • SHA256: 7aa4d1400999670fa8dd8e4577dbc425cf105ca50aaca326adcdc09e82049aef
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
assets/example/table_photo_chn_35.png ADDED

Git LFS Details

  • SHA256: 05edd8d535dd7363b0a72cd6bc4214c09099e7785c0534da7a4f5e556d5e1296
  • Pointer size: 132 Bytes
  • Size of remote file: 1.42 MB
assets/example/table_photo_eng_23.png ADDED

Git LFS Details

  • SHA256: 267480e94337b07eb6e1829496953e81a4615d30ff7fad571d9065e887e3ca63
  • Pointer size: 132 Bytes
  • Size of remote file: 1.49 MB
assets/example/table_scan_chn_1.png ADDED

Git LFS Details

  • SHA256: b28c8c79ce6b0cadfb9ebff3c3691b5a14ad996838ebbcf126b8d1db84f7daa9
  • Pointer size: 131 Bytes
  • Size of remote file: 191 kB
assets/example/table_scan_chn_37.png ADDED

Git LFS Details

  • SHA256: d5d6571e65aa4df696fff7e76770974b1053f2600e1c01f81afb8c4bb6f50feb
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
assets/example/table_scan_eng_12.png ADDED

Git LFS Details

  • SHA256: 3165c9aa5bb3b3570e97ac03c8057de916d8615e4a386eebf71e712427fce953
  • Pointer size: 131 Bytes
  • Size of remote file: 603 kB
header.html ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <html><head>
2
+ <!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css"> -->
3
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
4
+ <style>
5
+ .link-block {
6
+ border: 1px solid transparent;
7
+ border-radius: 24px;
8
+ background-color: rgba(54, 54, 54, 1);
9
+ cursor: pointer !important;
10
+ }
11
+ .link-block:hover {
12
+ background-color: rgba(54, 54, 54, 0.75) !important;
13
+ cursor: pointer !important;
14
+ }
15
+ .external-link {
16
+ display: inline-flex;
17
+ align-items: center;
18
+ height: 36px;
19
+ line-height: 36px;
20
+ padding: 0 16px;
21
+ cursor: pointer !important;
22
+ }
23
+ .external-link,
24
+ .external-link:hover {
25
+ cursor: pointer !important;
26
+ }
27
+ a {
28
+ text-decoration: none;
29
+ }
30
+ </style></head>
31
+
32
+ <body>
33
+ <div style="
34
+ display: flex;
35
+ flex-direction: column;
36
+ justify-content: center;
37
+ align-items: center;
38
+ text-align: center;
39
+ background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
40
+ padding: 24px;
41
+ gap: 24px;
42
+ border-radius: 8px;
43
+ ">
44
+ <div style="
45
+ display: flex;
46
+ flex-direction: column;
47
+ align-items: center;
48
+ gap: 16px;
49
+ ">
50
+ <div style="display: flex; flex-direction: column; gap: 8px">
51
+ <h1 style="
52
+ font-size: 48px;
53
+ color: #fafafa;
54
+ margin: 0;
55
+ font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
56
+ 'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
57
+ ">
58
+ TRivia-3B: Demo
59
+ </h1>
60
+ </div>
61
+ </div>
62
+
63
+ <p style="
64
+ margin: 0;
65
+ line-height: 1.6rem;
66
+ font-size: 16px;
67
+ color: #fafafa;
68
+ opacity: 0.8;
69
+ ">
70
+ Self-supervised Fine-tuning of Vision-Language Models for Table Recognition.<br>
71
+ </p>
72
+ <style>
73
+ .link-block {
74
+ display: inline-block;
75
+ }
76
+ .link-block + .link-block {
77
+ margin-left: 20px;
78
+ }
79
+ </style>
80
+
81
+ <div class="column has-text-centered">
82
+ <div class="publication-links">
83
+ <!-- Code Link. -->
84
+ <span class="link-block">
85
+ <a href="https://github.com/opendatalab/TRivia" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
86
+ <span class="icon" style="margin-right: 4px">
87
+ <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
88
+ </span>
89
+ <span style="color: white">Code</span>
90
+ </a>
91
+ </span>
92
+
93
+ <!-- Model Link. -->
94
+ <span class="link-block">
95
+ <a href="https://huggingface.co/opendatalab/TRivia-3B" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
96
+ <span class="icon" style="margin-right: 4px">
97
+ <i class="fas fa-archive" style="color: white; margin-right: 4px"></i>
98
+ </span>
99
+ <span style="color: white">Model</span>
100
+ </a>
101
+ </span>
102
+
103
+ <!-- Paper Link. -->
104
+ <span class="link-block">
105
+ <a href="https://arxiv.org/abs/2512.01248" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
106
+ <span class="icon" style="margin-right: 8px">
107
+ <i class="fas fa-file" style="color: white"></i>
108
+ </span>
109
+ <span style="color: white">Paper</span>
110
+ </a>
111
+ </span>
112
+ </div>
113
+ </div>
114
+
115
+ <!-- New Demo Links -->
116
+ </div>
117
+
118
+
119
+ </body></html>
otsl_utils.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import itertools
3
+ import html
4
+ from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
5
+ from pydantic import (
6
+ AnyUrl,
7
+ BaseModel,
8
+ ConfigDict,
9
+ Field,
10
+ StringConstraints,
11
+ computed_field,
12
+ field_validator,
13
+ model_validator,
14
+ )
15
+
class TableCell(BaseModel):
    """A single table cell: its text plus grid placement/span information."""

    row_span: int = 1
    col_span: int = 1
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Normalize dict payloads before validation.

        If the dict already carries a "text" key it is passed through
        unchanged; otherwise the text is assembled from the bbox "token"
        (falling back to concatenating the per-text-cell tokens).
        """
        if isinstance(data, Dict):
            # Check if this is a native BoundingBox or a bbox from docling-ibm-models
            if (
                # "bbox" not in data
                # or data["bbox"] is None
                # or isinstance(data["bbox"], BoundingBox)
                "text"
                in data
            ):
                return data
            # No "text" key: derive the cell text from the bbox token.
            text = data["bbox"].get("token", "")
            if not len(text):
                # Fall back to joining tokens of the individual text cells.
                text_cells = data.pop("text_cell_bboxes", None)
                if text_cells:
                    for el in text_cells:
                        text += el["token"] + " "

            text = text.strip()
            data["text"] = text

        return data
class TableData(BaseModel):  # TBD
    """Tabular data: a flat list of cells plus the table dimensions."""

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field  # type: ignore
    @property
    def grid(
        self,
    ) -> List[List[TableCell]]:
        """Materialize the num_rows x num_cols grid of cells."""
        # Start from a grid of fresh, empty 1x1 cells.
        grid_rows: List[List[TableCell]] = []
        for row in range(self.num_rows):
            current_row = []
            for col in range(self.num_cols):
                current_row.append(
                    TableCell(
                        text="",
                        start_row_offset_idx=row,
                        end_row_offset_idx=row + 1,
                        start_col_offset_idx=col,
                        end_col_offset_idx=col + 1,
                    )
                )
            grid_rows.append(current_row)

        # Stamp each real cell over every grid slot its span covers,
        # clamping spans that run past the declared table size.
        for cell in self.table_cells:
            row_stop = min(cell.end_row_offset_idx, self.num_rows)
            col_stop = min(cell.end_col_offset_idx, self.num_cols)
            for row in range(min(cell.start_row_offset_idx, self.num_rows), row_stop):
                for col in range(min(cell.start_col_offset_idx, self.num_cols), col_stop):
                    grid_rows[row][col] = cell

        return grid_rows
"""
OTSL
"""
# OTSL structural tags (Optimized Table Structure Language):
OTSL_NL = "<nl>"      # end of table row
OTSL_FCEL = "<fcel>"  # full cell (carries content)
OTSL_ECEL = "<ecel>"  # empty cell
OTSL_LCEL = "<lcel>"  # cell merged with the cell to its left
OTSL_UCEL = "<ucel>"  # cell merged with the cell above
OTSL_XCEL = "<xcel>"  # cell merged both left and up (2-D span)

# Compiled once at import time. The capturing group makes re.split() keep
# the tags in the split result.
_OTSL_TOKEN_RE = re.compile(
    "(" + "|".join((OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL)) + ")"
)

def otsl_extract_tokens_and_text(s: str):
    """Split an OTSL string into its tag tokens and interleaved parts.

    Args:
        s: Raw OTSL output (tags plus cell text).

    Returns:
        (tokens, text_parts): ``tokens`` is the list of OTSL tags in order;
        ``text_parts`` is the full token/text sequence with whitespace-only
        fragments removed (tags are kept so cell text can be matched to the
        tag that precedes it).
    """
    tokens = _OTSL_TOKEN_RE.findall(s)
    # Split on the same captured pattern so tags stay in the sequence, then
    # drop empty / whitespace-only fragments.
    text_parts = [part for part in _OTSL_TOKEN_RE.split(s) if part.strip()]
    return tokens, text_parts
def otsl_parse_texts(texts, tokens):
    """Parse an OTSL token/text sequence into TableCell objects.

    Args:
        texts: Interleaved tag/text sequence from
            otsl_extract_tokens_and_text (structural tags plus cell text).
        tokens: The structural tags only.

    Returns:
        (table_cells, split_row_tokens): the parsed cells and the tag
        matrix, one inner list of tags per table row.
    """
    split_word = OTSL_NL
    # Group tags into rows by splitting on <nl>.
    split_row_tokens = [
        list(y)
        for x, y in itertools.groupby(tokens, lambda z: z == split_word)
        if not x
    ]
    table_cells = []
    r_idx = 0
    c_idx = 0

    # Pad the tag matrix so every row has the same number of columns.
    if split_row_tokens:
        # The widest row determines the table's column count.
        max_cols = max(len(row) for row in split_row_tokens)

        # Pad each short row with empty cells up to max_cols.
        for row_idx, row in enumerate(split_row_tokens):
            while len(row) < max_cols:
                row.append(OTSL_ECEL)

        # The padding <ecel> tags must also appear in `texts`, so rebuild
        # the interleaved sequence from the padded matrix, carrying over
        # any cell text that followed each original tag.
        new_texts = []
        text_idx = 0

        for row_idx, row in enumerate(split_row_tokens):
            for col_idx, token in enumerate(row):
                new_texts.append(token)
                # If this tag matches the next original entry, consume it...
                if text_idx < len(texts) and texts[text_idx] == token:
                    text_idx += 1
                    # ...and keep the following entry when it is cell text
                    # (i.e. not another structural tag).
                    if (text_idx < len(texts) and
                        texts[text_idx] not in [OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]):
                        new_texts.append(texts[text_idx])
                        text_idx += 1

            new_texts.append(OTSL_NL)
            if text_idx < len(texts) and texts[text_idx] == OTSL_NL:
                text_idx += 1

        texts = new_texts

    def count_right(tokens, c_idx, r_idx, which_tokens):
        # Count consecutive tags to the right of (r_idx, c_idx) that
        # continue a horizontal span.
        span = 0
        c_idx_iter = c_idx
        while tokens[r_idx][c_idx_iter] in which_tokens:
            c_idx_iter += 1
            span += 1
            if c_idx_iter >= len(tokens[r_idx]):
                return span
        return span

    def count_down(tokens, c_idx, r_idx, which_tokens):
        # Count consecutive tags below (r_idx, c_idx) that continue a
        # vertical span.
        span = 0
        r_idx_iter = r_idx
        while tokens[r_idx_iter][c_idx] in which_tokens:
            r_idx_iter += 1
            span += 1
            if r_idx_iter >= len(tokens):
                return span
        return span

    # Walk the interleaved sequence, emitting a TableCell for each cell
    # anchor (<fcel>/<ecel>) and tracking the current grid position.
    for i, text in enumerate(texts):
        cell_text = ""
        if text in [
            OTSL_FCEL,
            OTSL_ECEL,
        ]:
            row_span = 1
            col_span = 1
            right_offset = 1
            # A full cell may be followed by its text content.
            if text != OTSL_ECEL and (texts[i + 1] not in [OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]):
                cell_text = texts[i + 1]
                right_offset = 2

            # Check next element(s) for lcel / ucel / xcel,
            # set properly row_span, col_span
            next_right_cell = ""
            if i + right_offset < len(texts):
                next_right_cell = texts[i + right_offset]

            next_bottom_cell = ""
            if r_idx + 1 < len(split_row_tokens):
                if c_idx < len(split_row_tokens[r_idx + 1]):
                    next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]

            if next_right_cell in [
                OTSL_LCEL,
                OTSL_XCEL,
            ]:
                # we have horizontal spanning cell or 2d spanning cell
                col_span += count_right(
                    split_row_tokens,
                    c_idx + 1,
                    r_idx,
                    [OTSL_LCEL, OTSL_XCEL],
                )
            if next_bottom_cell in [
                OTSL_UCEL,
                OTSL_XCEL,
            ]:
                # we have a vertical spanning cell or 2d spanning cell
                row_span += count_down(
                    split_row_tokens,
                    c_idx,
                    r_idx + 1,
                    [OTSL_UCEL, OTSL_XCEL],
                )

            table_cells.append(
                TableCell(
                    text=cell_text.strip(),
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=r_idx,
                    end_row_offset_idx=r_idx + row_span,
                    start_col_offset_idx=c_idx,
                    end_col_offset_idx=c_idx + col_span,
                )
            )
        # Any cell tag advances the column; <nl> starts a new row.
        if text in [
            OTSL_FCEL,
            OTSL_ECEL,
            OTSL_LCEL,
            OTSL_UCEL,
            OTSL_XCEL,
        ]:
            c_idx += 1
        if text == OTSL_NL:
            r_idx += 1
            c_idx = 0
    return table_cells, split_row_tokens
def export_to_html(table_data: TableData) -> str:
    """Render a TableData as an HTML ``<table>`` string.

    Spanning cells emit rowspan/colspan attributes and are written only at
    their anchor (top-left) grid position.

    Args:
        table_data: Parsed table with dimensions and cells.

    Returns:
        The HTML string, or "" when the table has no cells.

    NOTE(review): cell text is inserted without HTML-escaping (the escape
    call was deliberately commented out upstream), so markup characters in
    cell content pass through verbatim — confirm this is intended before
    feeding untrusted text here.
    """
    nrows = table_data.num_rows
    ncols = table_data.num_cols

    if not table_data.table_cells:
        return ""

    current_grid = table_data.grid

    html_str_list = []

    for i in range(nrows):
        html_str_list.append("<tr>")
        for j in range(ncols):
            cell: TableCell = current_grid[i][j]

            # A spanning cell occupies several grid slots; emit it only
            # once, at its top-left position.
            if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
                continue

            content = cell.text.strip()
            cell_tag_name = "th" if cell.column_header else "td"

            opening_tag_parts = [f"<{cell_tag_name}"]
            if cell.row_span > 1:
                opening_tag_parts.append(f' rowspan="{cell.row_span}"')
            if cell.col_span > 1:
                opening_tag_parts.append(f' colspan="{cell.col_span}"')
            opening_tag_parts.append(">")
            opening_tag = "".join(opening_tag_parts)

            html_str_list.append(f"{opening_tag}{content}</{cell_tag_name}>")
        html_str_list.append("</tr>")

    body_content = "".join(html_str_list)
    return f"<table>{body_content}</table>"
def convert_otsl_to_html(otsl_content: str) -> str:
    """Convert an OTSL string (possibly still streaming/partial) to HTML."""
    tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)

    table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)

    # Table width is the widest row; zero rows means an empty table.
    num_cols = max(len(row) for row in split_row_tokens) if split_row_tokens else 0
    table_data = TableData(
        num_rows=len(split_row_tokens),
        num_cols=num_cols,
        table_cells=table_cells,
    )

    return export_to_html(table_data)
if __name__ == "__main__":
    # Smoke test: a real multi-row OTSL sample with horizontal (<lcel>) and
    # vertical (<ucel>) spans. (Removed: an unused `import time` and an
    # unused second fixture string.)
    b = """<fcel>Reviewer<fcel>Representation<fcel>Consultant<fcel>Speaker's Bureau<fcel>Ownership/ Partnership/ Principal<fcel>Personal Research<fcel>Institutional, Organizational, or Other Financial Benefit<fcel>Expert Witness<nl>
<fcel>John E. Brush<fcel>Official Reviewer–ACCF Board of Trustees<fcel>● United Healthcare<fcel>None<fcel>None<fcel>None<fcel>● PROMETHEUS Payment (Board member)<fcel>None<nl>
<fcel>David P. Faxon<fcel>Official Reviewer–AHA<fcel>● Johnson & Johnson<fcel>None<fcel>● CULPRIT Trial (PI)*<fcel>None<fcel>● Circulation: Cardiovascular Interventions—Editor*<fcel>None<nl>
<ucel><ucel><ucel><ucel><fcel>● RIVA Medical<ucel><ucel><ucel><nl>
<fcel>Robert A. Harrington<fcel>Official Reviewer–AHA<fcel>● AstraZeneca*<fcel>None<fcel>None<fcel>● AstraZeneca<fcel>None<fcel>None<nl>
<ucel><ucel><fcel>● Baxter<ucel><ucel><fcel>● Baxter<ucel><ucel><nl>
<ucel><ucel><fcel>● CSL Behring<ucel><ucel><fcel>● Bristol-Myers Squibb*<ucel><ucel><nl>
<ucel><ucel><fcel>● Eli Lilly<ucel><ucel><fcel>● GlaxoSmithKline<ucel><ucel><nl>
<ucel><ucel><fcel>● Luiypold<ucel><ucel><fcel>● The Medicines Company<ucel><ucel><nl>
<ucel><ucel><fcel>● Merck<ucel><ucel><fcel>● Merck*<ucel><ucel><nl>
<ucel><ucel><fcel>● Novartis<ucel><ucel><fcel>● Portola*<ucel><ucel><nl>
<ucel><ucel><fcel>● Otsuka Maryland Research Institute<ucel><ucel><fcel>● Schering-Plough*<ucel><ucel><nl>
<ucel><ucel><fcel>● Regado<ucel><ucel><ucel><ucel><ucel><nl>
<ucel><ucel><fcel>● Sanofi-aventis<ucel><ucel><ucel><ucel><ucel><nl>
<ucel><ucel><fcel>● Schering-Plough*<ucel><ucel><ucel><ucel><ucel><nl>
<ucel><ucel><fcel>● WebMD*<ucel><ucel><ucel><ucel><ucel><nl>
<fcel>Judith S. Hochman<fcel>Official Reviewer–ACCF/AHA Task Force on Practice Guidelines<fcel>● BMS/Sanofi<fcel>None<fcel>None<fcel>● Johnson & Johnson/Bayer Healthcare AG (DSMB)<fcel>None<fcel>None<nl>
<ucel><ucel><fcel>● Eli Lilly<ucel><ucel><ucel><ucel><ucel><nl>
<ucel><ucel><fcel>● GlaxoSmithKline<ucel><ucel><ucel><ucel><ucel><nl>
<ucel><ucel><fcel>● Millennium Pharmaceuticals/ Schering-Plough<ucel><ucel><fcel>● Schering-Plough (TIMI 50) (DSMB)<ucel><ucel><nl>
<fcel>Rodney H. Zimmermann<fcel>Official Reviewer–ACCF Board of Governors<fcel>● AstraZeneca<fcel>● AstraZeneca<fcel>None<fcel>● AstraZeneca<fcel>None<fcel>None<nl>
<ucel><ucel><fcel>● Boehringer Ingelheim<fcel>● Merck-Frost<fcel>● Sanofi-aventis<ucel><fcel>● Sanofi-aventis<ucel><nl>
<ucel><ucel><fcel>● Bristol-Myers Squibb<fcel>● Servier<ucel><ucel><ucel><ucel><nl>
<ucel><ucel><fcel>● Medtronic<ucel><ucel><ucel><ucel><ucel><nl>
<ucel><ucel><fcel>● Sanofi-aventis<ucel><ucel><ucel><ucel><ucel><nl>
<ucel><ucel><fcel>● Schering-Plough<ucel><ucel><ucel><ucel><ucel><nl>
<fcel>Steven Brown<fcel>Organizational Reviewer–AAFP<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
<fcel>Joseph C. Cleveland<fcel>Organizational Reviewer–STS<fcel>● Baxter Biosurgery<fcel>None<fcel>None<fcel>None<fcel>● Heartware<fcel>None<nl>
<ucel><ucel><fcel>● Essential Pharmaceuticals<ucel><ucel><ucel><fcel>● Thoratec<ucel><nl>
<fcel>Wyatt Decker<fcel>Organizational Reviewer–ACEP<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
<fcel>Joseph A. de Gregorio<fcel>Organizational Reviewer–SCAI<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
<fcel>Deborah B. Diercks<fcel>Organizational Reviewer–ACEP<fcel>● AstraZeneca<fcel>None<fcel>None<fcel>None<fcel>● Society of Chest Pain Centers and Providers<fcel>None<nl>
<ucel><ucel><fcel>● Sanofi-aventis<ucel><ucel><ucel><ucel><ucel><nl>
<ucel><ucel><fcel>● Schering-Plough<ucel><ucel><ucel><ucel><ucel><nl>
<fcel>Benjamin Hatten<fcel>Organizational Reviewer–ACEP<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
<fcel>Loren F. Hiratzka<fcel>Organizational Reviewer–STS<fcel>None<fcel>None<fcel>None<fcel>None<fcel>● Cardiac, Vascular, and Thoracic Surgeons*<fcel>None<nl>
<ucel><ucel><ucel><ucel><ucel><ucel><fcel>● TriHealth (Bethesda North and Good Samaritan Hospitals)*<ucel><nl>
<fcel>Jason H. Rogers<fcel>Organizational Reviewer–SCAI<fcel>● Ample Medical<fcel>None<fcel>None<fcel>None<fcel>None<fcel>None<nl>
<fcel>Vincenza T. Show<fcel>Organizational Reviewer–ACP<fcel>None<fcel>None<fcel>None<fcel>● Boehringer Ingelheim*<fcel>● ACP*<fcel>None<nl>
<ucel><ucel><ucel><ucel><ucel><fcel>● Bristol-Myers Squibb*<ucel><ucel><nl>
"""
    print(convert_otsl_to_html(b))
requirements.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Flash Attention - CUDA 12, PyTorch 2.6, Python 3.10
2
+ flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
3
+
4
+ # Core ML/AI Libraries
5
+ torch==2.6.0
6
+ torchvision
7
+ accelerate>=0.24.0
8
+
9
+ # Transformers - using version compatible with both sets of models
10
+ transformers==4.57.1
11
+ tokenizers>=0.20.3
12
+ transformers-stream-generator
13
+
14
+ # Hugging Face
15
+ huggingface_hub
16
+ hf_xet
17
+ spaces>=0.20.0
18
+
19
+ # Vision & Image Processing
20
+ qwen-vl-utils
21
+
22
+ # Web Interface
23
+ gradio==5.9.1
24
+ pydantic==2.10.6