import io

import fitz
from docx import Document


def _aligned_strip(text, char_map):
    """Strip surrounding whitespace from *text* and trim *char_map* in lockstep.

    The extractors build one char_map entry per character of the raw text.
    A plain ``text.strip()`` would drop leading/trailing whitespace while the
    corresponding char_map entries remained, shifting every later index (e.g.
    a leading empty paragraph in a .docx). Returns (stripped_text, char_map)
    with ``len(char_map) == len(stripped_text)``.
    """
    lead = len(text) - len(text.lstrip())
    stripped = text.strip()
    return stripped, char_map[lead:lead + len(stripped)]


def process_pdf(file_stream: io.BytesIO):
    """Extract plain text from a PDF and a per-character location map.

    Args:
        file_stream: In-memory PDF bytes.

    Returns:
        (text, char_map) where char_map[i] is a dict with the page index and
        word bounding box of text[i], or None for the separator spaces that
        are inserted between extracted words.
    """
    doc = fitz.open(stream=file_stream, filetype="pdf")
    parts = []      # word fragments, joined once at the end (avoids O(n^2) +=)
    char_map = []   # one entry per character of the raw (pre-strip) text
    try:
        for page_num, page in enumerate(doc):
            # "words" yields tuples starting with (x0, y0, x1, y1, word, ...)
            for word in page.get_text("words"):
                x0, y0, x1, y1, word_text = word[:5]
                location_info = {"page": page_num, "bbox": (x0, y0, x1, y1)}
                char_map.extend([location_info] * len(word_text))
                char_map.append(None)  # the separator space has no location
                parts.append(word_text + " ")
    finally:
        # Close the document even if extraction raises mid-way.
        doc.close()
    return _aligned_strip("".join(parts), char_map)


def process_docx(file_stream: io.BytesIO):
    """Extract plain text from a .docx and a per-character location map.

    Args:
        file_stream: In-memory .docx bytes.

    Returns:
        (text, char_map) where char_map[i] is a dict with the paragraph and
        run indices of text[i], or None for the newline inserted after each
        paragraph.
    """
    doc = Document(file_stream)
    parts = []      # run texts + paragraph newlines, joined once at the end
    char_map = []   # one entry per character of the raw (pre-strip) text
    for p_idx, para in enumerate(doc.paragraphs):
        for r_idx, run in enumerate(para.runs):
            location_info = {"p_idx": p_idx, "r_idx": r_idx}
            char_map.extend([location_info] * len(run.text))
            parts.append(run.text)
        char_map.append(None)  # the paragraph-break newline has no location
        parts.append("\n")
    # _aligned_strip keeps char_map indices valid even when leading empty
    # paragraphs would otherwise be stripped out of the text only.
    return _aligned_strip("".join(parts), char_map)


def map_ner_results(ner_results, char_map, Entity):
    """Convert NER span results into Entity objects with document locations.

    Args:
        ner_results: Iterable of dicts with 'start', 'end', 'word', and
            'entity_group' keys (HuggingFace token-classification style,
            character offsets into the extracted text).
        char_map: Per-character location list from process_pdf/process_docx.
        Entity: Constructor accepting text=, label=, location= keywords.

    Returns:
        List of Entity objects; spans whose characters have no mapped
        location (all-None or out of range) are skipped.
    """
    entities = []
    for result in ner_results:
        start, end = result['start'], result['end']
        # Deduplicate locations while preserving first-seen order; dicts are
        # unhashable, so key on their sorted item tuples.
        unique_locations = {}
        for i in range(start, end):
            if i < len(char_map) and char_map[i]:
                loc_tuple = tuple(sorted(char_map[i].items()))
                if loc_tuple not in unique_locations:
                    unique_locations[loc_tuple] = char_map[i]
        if unique_locations:
            entities.append(Entity(
                text=result['word'],
                label=result['entity_group'],
                location=list(unique_locations.values())
            ))
    return entities