Spaces:
Sleeping
Sleeping
File size: 5,673 Bytes
e8b46b5 704d2a2 b0a4dc4 1055fe1 b0a4dc4 704d2a2 f4b6b63 b0a4dc4 47f7e99 b0a4dc4 47f7e99 b0a4dc4 704d2a2 b0a4dc4 704d2a2 b0a4dc4 704d2a2 b0a4dc4 704d2a2 b0a4dc4 e8b46b5 b0a4dc4 704d2a2 b0a4dc4 f4b6b63 b0a4dc4 1055fe1 b0a4dc4 ab82879 1055fe1 b0a4dc4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
#!/usr/bin/env python3
import re
import json
import sys
from docx import Document
from docx.oxml.ns import qn
from master_key import TABLE_SCHEMAS, HEADING_PATTERNS, PARAGRAPH_PATTERNS
def is_red_font(run):
    """Return True when *run* carries a red-ish font colour.

    Two sources are consulted in order:

    1. ``run.font.color.rgb`` -- unpacked as an ``(r, g, b)`` triple.
    2. The ``w:color`` child of ``run._element.rPr``, whose ``w:val``
       attribute is decoded as a six-digit hex string.

    A colour counts as red when the red channel is above 150, green and
    blue are both below 100, and red exceeds each of them by more than 30.

    Args:
        run: Run-like object exposing ``font.color`` and ``_element``
            (presumably a python-docx ``Run`` -- confirm against callers).

    Returns:
        bool: True if either source yields a red colour, otherwise False.
    """

    def _is_reddish(r, g, b):
        # Single definition of the "red" thresholds shared by both sources.
        return r > 150 and g < 100 and b < 100 and (r - g) > 30 and (r - b) > 30

    col = run.font.color
    rgb = col.rgb if col else None
    if rgb:
        r, g, b = rgb
        if _is_reddish(r, g, b):
            return True

    # Fall back to the raw run properties element.
    rPr = getattr(run._element, "rPr", None)
    if rPr is None:
        return False
    clr = rPr.find(qn('w:color'))
    if clr is None:
        return False
    val = clr.get(qn('w:val'))
    # get() yields None when the attribute is missing; passing None to
    # re.fullmatch would raise TypeError, so reject it explicitly. Values
    # that are not a six-digit hex string cannot be decoded either.
    if not val or not re.fullmatch(r"[0-9A-Fa-f]{6}", val):
        return False
    rr, gg, bb = (int(val[i:i + 2], 16) for i in (0, 2, 4))
    return _is_reddish(rr, gg, bb)
def _prev_para_text(tbl):
    """Return the stripped text of the nearest paragraph element before *tbl*.

    Walks backwards over the preceding siblings of ``tbl._tbl`` until one
    whose tag ends in ``"}p"`` is found, then concatenates the ``text`` of
    every descendant whose tag ends in ``"}t"``.

    Args:
        tbl: Object whose ``_tbl`` attribute is an element supporting
            ``getprevious()`` (presumably a python-docx ``Table`` wrapping
            an lxml element -- confirm against callers).

    Returns:
        str: The paragraph text with surrounding whitespace removed, or
        ``""`` when no preceding paragraph sibling exists.
    """

    def _tag_ends(node, suffix):
        # Comment / processing-instruction nodes expose a non-string ``tag``;
        # calling ``.endswith`` on it would raise AttributeError.
        tag = node.tag
        return isinstance(tag, str) and tag.endswith(suffix)

    prev = tbl._tbl.getprevious()
    while prev is not None and not _tag_ends(prev, "}p"):
        prev = prev.getprevious()
    if prev is None:
        return ""
    return "".join(
        node.text for node in prev.iter() if _tag_ends(node, "}t") and node.text
    ).strip()
def match_table_schema(tbl):
    """Return the name of the ``TABLE_SCHEMAS`` entry that fits *tbl*.

    Rules are tried in priority order and the first hit wins:

    1. The first cell of the first row is itself a schema name, and that
       schema either declares no ``headings`` or one of its headings equals
       the text of the paragraph preceding the table.
    2. Any schema with a ``headings`` entry equal to that paragraph text.
    3. Any schema whose ``columns`` all appear among the first-row texts.
    4. Any ``"row1"`` schema whose ``labels`` all appear in the first row.
    5. Any ``"left"`` schema whose ``labels`` all appear in the first column.

    Args:
        tbl: Table-like object exposing ``rows`` (each with ``cells`` whose
            items have ``text``) and ``_tbl``.

    Returns:
        The matching schema name, or ``None`` when no rule matches or the
        table has no rows.
    """
    rows = tbl.rows
    if len(rows) == 0:
        # No header row to inspect; indexing rows[0] would raise IndexError.
        return None

    # Resolve each row's cells once and derive both views from that.
    row_cells = [row.cells for row in rows]
    headers = [c.text.strip() for c in row_cells[0]]
    col0 = [cells[0].text.strip() for cells in row_cells if cells]
    first = headers[0] if headers else None
    heading = _prev_para_text(tbl)

    def heading_matches(spec):
        return any(h["text"] == heading for h in spec.get("headings", []))

    # 1) exact first-cell name
    if first in TABLE_SCHEMAS:
        spec = TABLE_SCHEMAS[first]
        if not spec.get("headings") or heading_matches(spec):
            return first

    # 2) any other schema with explicit headings
    for name, spec in TABLE_SCHEMAS.items():
        if heading_matches(spec):
            return name

    # 3) by two-column 'columns'
    for name, spec in TABLE_SCHEMAS.items():
        cols = spec.get("columns")
        if cols and all(col in headers for col in cols):
            return name

    # 4) row1 tables
    for name, spec in TABLE_SCHEMAS.items():
        if spec["orientation"] == "row1" and all(lbl in headers for lbl in spec["labels"]):
            return name

    # 5) left tables
    for name, spec in TABLE_SCHEMAS.items():
        if spec["orientation"] == "left" and all(lbl in col0 for lbl in spec["labels"]):
            return name

    return None
def _red_run_text(runs):
    """Join the text of every run in *runs* that ``is_red_font`` accepts."""
    return "".join(run.text for run in runs if is_red_font(run)).strip()


def _collect_split_labels(tbl, schema, spec, out):
    """Split the first cell of the second row on ``spec["split_labels"]``.

    Text before the first split label is stored under ``spec["labels"][0]``;
    the text following each split label (up to the next one, or to the end
    for the last) is stored under that label. Results are appended into
    ``out[schema]``, which is only created when something is found.
    """
    rows = tbl.rows
    if len(rows) < 2:
        # Header-only table: there is no second row to read.
        return
    cell_txt = rows[1].cells[0].text.strip()
    split_labels = spec["split_labels"]

    narrative = cell_txt.partition(split_labels[0])[0].strip()
    if narrative:
        out.setdefault(schema, {}).setdefault(spec["labels"][0], []).append(narrative)

    for i, lbl in enumerate(split_labels):
        nxt = split_labels[i + 1] if i + 1 < len(split_labels) else None
        if nxt:
            # Lazy match that stops right before the next label.
            pattern = rf"{re.escape(lbl)}\s*(.+?)(?={re.escape(nxt)})"
        else:
            pattern = rf"{re.escape(lbl)}\s*(.+)$"
        m = re.search(pattern, cell_txt, flags=re.DOTALL)
        if m:
            out.setdefault(schema, {}).setdefault(lbl, []).append(m.group(1).strip())


def _collect_table_red_text(tbl, schema, spec):
    """Return ``{label: [red text, ...]}`` for the rows after the first.

    For ``"row1"`` schemas the label is chosen by column index; otherwise
    it is the row's first-cell text when that is a known label. Anything
    that cannot be attributed falls back to the schema name. Duplicate
    strings under one label are dropped, and empty labels are omitted.
    """
    labels = spec["labels"]
    keys = labels + [schema]
    collected = {lbl: [] for lbl in keys}
    seen = {lbl: set() for lbl in keys}
    by_col = spec["orientation"] == "row1"

    for row in tbl.rows[1:]:
        cells = row.cells  # resolve once per row
        for ci, cell in enumerate(cells):
            red_txt = _red_run_text(run for p in cell.paragraphs for run in p.runs)
            if not red_txt:
                continue
            if by_col:
                # column header -> defined label
                lbl = labels[ci] if ci < len(labels) else schema
            else:
                # first cell in this row -> must be one of the labels
                raw = cells[0].text.strip()
                lbl = raw if raw in labels else schema
            if red_txt not in seen[lbl]:
                seen[lbl].add(red_txt)
                collected[lbl].append(red_txt)

    # keep only non-empty
    return {k: v for k, v in collected.items() if v}


def _collect_paragraph_red_text(paragraphs):
    """Group red paragraph text under the nearest heading above it.

    A heading is any earlier paragraph whose stripped text is non-empty and
    matches one of the ``HEADING_PATTERNS`` ``"main"`` or ``"sub"`` regexes.
    With no heading above, text fully matching
    ``PARAGRAPH_PATTERNS["date_line"]`` is filed under ``"Date"`` and all
    other text under ``"(para)"``.
    """
    heading_patterns = HEADING_PATTERNS["main"] + HEADING_PATTERNS["sub"]
    paras = {}
    last_heading = None
    for para in paragraphs:
        red_txt = _red_run_text(para.runs)
        if red_txt:
            context = last_heading
            # fallback for date line
            if not context and re.fullmatch(PARAGRAPH_PATTERNS["date_line"], red_txt):
                context = "Date"
            paras.setdefault(context or "(para)", []).append(red_txt)
        # Update only after handling this paragraph, so a paragraph is never
        # treated as its own heading.
        txt = para.text.strip()
        if txt and any(re.search(p, txt) for p in heading_patterns):
            last_heading = txt
    return paras


def extract_red_text(path):
    """Extract red-coloured text from the document at *path*.

    Tables are matched against ``TABLE_SCHEMAS`` with
    ``match_table_schema``; unmatched tables are skipped. Body paragraphs
    are grouped by the nearest heading above them.

    Args:
        path: Passed straight to ``Document``.

    Returns:
        dict: ``{schema_name: {label: [text, ...]}}`` for each table that
        produced data, plus a ``"paragraphs"`` key mapping context to a
        list of texts when any paragraph contains red text.
    """
    doc = Document(path)
    out = {}

    # --- TABLES ---
    for tbl in doc.tables:
        schema = match_table_schema(tbl)
        if not schema:
            continue
        spec = TABLE_SCHEMAS[schema]
        # handle the special split_labels (row1 only)
        if spec.get("split_labels") and spec["orientation"] == "row1":
            _collect_split_labels(tbl, schema, spec, out)
            continue
        # normal tables
        data = _collect_table_red_text(tbl, schema, spec)
        if data:
            # NOTE(review): a later table matching the same schema replaces
            # the earlier entry, unlike the split-label path which appends --
            # confirm that last-one-wins is intended.
            out[schema] = data

    # --- PARAGRAPHS ---
    # Read doc.paragraphs once instead of on every step of a backward scan.
    paras = _collect_paragraph_red_text(doc.paragraphs)
    if paras:
        out["paragraphs"] = paras
    return out
if __name__ == "__main__":
    # Usage: script [path]; falls back to "test.docx" when no path is given.
    fn = sys.argv[1] if len(sys.argv) > 1 else "test.docx"
    # Dump the extraction result as pretty-printed, non-ASCII-preserving JSON.
    print(json.dumps(extract_red_text(fn), indent=2, ensure_ascii=False))