Spaces:
Runtime error
Runtime error
| import io | |
| import fitz | |
| from docx import Document | |
def process_pdf(file_stream: io.BytesIO):
    """Extract the text of a PDF together with a per-character location map.

    Words are read page by page via ``page.get_text("words")`` and joined
    with single spaces.

    Args:
        file_stream: In-memory PDF content, handed to ``fitz.open`` as a
            stream.

    Returns:
        A ``(text, char_map)`` tuple. ``text`` is the joined words with
        surrounding whitespace stripped. ``char_map`` has exactly one entry
        per character of ``text``: ``{"page": page_index, "bbox": (x0, y0,
        x1, y1)}`` for characters that belong to a word (all characters of
        one word share the same dict object) and ``None`` for the separator
        spaces.
    """
    pieces = []
    char_map = []
    doc = fitz.open(stream=file_stream, filetype="pdf")
    try:
        for page_num, page in enumerate(doc):
            for word in page.get_text("words"):
                # Only the first five fields are used: bbox corners + text.
                x0, y0, x1, y1, word_text = word[:5]
                location_info = {"page": page_num, "bbox": (x0, y0, x1, y1)}
                char_map.extend([location_info] * len(word_text))
                pieces.append(word_text)
                # Separator space after every word; it has no location.
                char_map.append(None)
                pieces.append(" ")
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()

    raw_text = "".join(pieces)
    text = raw_text.strip()
    # strip() may remove characters from both ends; cut the same characters
    # out of char_map so that text[i] always corresponds to char_map[i].
    lead = len(raw_text) - len(raw_text.lstrip())
    return text, char_map[lead:lead + len(text)]
def process_docx(file_stream: io.BytesIO):
    """Extract the text of a DOCX file together with a per-character location map.

    Every paragraph contributes the concatenated text of its runs followed
    by a newline.

    Args:
        file_stream: In-memory DOCX content, handed to ``Document``.

    Returns:
        A ``(text, char_map)`` tuple. ``text`` is the concatenated paragraph
        text with surrounding whitespace stripped. ``char_map`` has exactly
        one entry per character of ``text``: ``{"p_idx": paragraph_index,
        "r_idx": run_index}`` for characters that come from a run (all
        characters of one run share the same dict object) and ``None`` for
        the paragraph-terminating newlines.
    """
    doc = Document(file_stream)
    pieces = []
    char_map = []
    for p_idx, para in enumerate(doc.paragraphs):
        for r_idx, run in enumerate(para.runs):
            run_text = run.text
            location_info = {"p_idx": p_idx, "r_idx": r_idx}
            char_map.extend([location_info] * len(run_text))
            pieces.append(run_text)
        # Paragraph break: one newline character with no run location.
        char_map.append(None)
        pieces.append("\n")

    raw_text = "".join(pieces)
    text = raw_text.strip()
    # Documents that begin with empty paragraphs or whitespace-only runs lose
    # leading characters in strip(). Drop the same number of entries from the
    # front of char_map (and the stripped tail from its end) so that text[i]
    # always corresponds to char_map[i].
    lead = len(raw_text) - len(raw_text.lstrip())
    return text, char_map[lead:lead + len(text)]
def map_ner_results(ner_results, char_map, Entity):
    """Attach document locations to NER results via the character map.

    Args:
        ner_results: Iterable of mappings with ``'start'`` and ``'end'``
            character offsets plus ``'word'`` and ``'entity_group'``.
        char_map: Sequence indexed by character offset whose items are
            location dicts, or a falsy value where no location exists.
        Entity: Callable invoked as ``Entity(text=..., label=...,
            location=...)`` to build each output item.

    Returns:
        A list of ``Entity(...)`` results, one per NER result that overlaps
        at least one located character. ``location`` holds the distinct
        location dicts in order of first appearance within the span.
    """
    mapped = []
    map_len = len(char_map)
    for hit in ner_results:
        span_start = hit['start']
        span_end = hit['end']
        seen = {}
        for offset in range(span_start, span_end):
            if offset >= map_len:
                continue
            loc = char_map[offset]
            if not loc:
                continue
            # Dicts are unhashable; a sorted item tuple serves as the
            # identity of a location for de-duplication.
            seen.setdefault(tuple(sorted(loc.items())), loc)
        if not seen:
            # No located character inside the span: the result is dropped.
            continue
        mapped.append(
            Entity(
                text=hit['word'],
                label=hit['entity_group'],
                location=list(seen.values()),
            )
        )
    return mapped