PII-Masker / app /document_utils.py
abishekcodes's picture
Deployment Test
98abe61
import io
import fitz
from docx import Document
def process_pdf(file_stream: io.BytesIO):
    """Extract the text of a PDF together with a per-character location map.

    Words are read page by page and joined with single spaces.

    Args:
        file_stream: In-memory PDF data, opened via ``fitz.open(stream=...)``.

    Returns:
        A ``(text, char_map)`` tuple. ``text`` is the space-joined word text
        with surrounding whitespace stripped. ``char_map`` has exactly one
        entry per character of ``text``: a dict
        ``{"page": page_number, "bbox": (x0, y0, x1, y1)}`` for characters
        that belong to a word (all characters of one word share the same
        dict object), or ``None`` for the separator spaces.
    """
    pieces = []
    char_map = []
    doc = fitz.open(stream=file_stream, filetype="pdf")
    try:
        for page_num, page in enumerate(doc):
            for word in page.get_text("words"):
                # The first five items of a word entry are its bounding box
                # followed by the word string; anything after is ignored.
                x0, y0, x1, y1, word_text = word[:5]
                location_info = {"page": page_num, "bbox": (x0, y0, x1, y1)}
                # One shared location entry per character of the word ...
                char_map.extend([location_info] * len(word_text))
                # ... and a None for the separator space that follows it.
                char_map.append(None)
                pieces.append(word_text)
                pieces.append(" ")
    finally:
        # Always release the document, even if extraction fails midway.
        doc.close()

    raw_text = "".join(pieces)
    # Strip the text exactly as before, but drop the matching entries from
    # char_map too, so char_map[i] keeps describing text[i]. Without this,
    # any stripped leading character would shift every offset.
    left_trimmed = raw_text.lstrip()
    lead = len(raw_text) - len(left_trimmed)
    text = left_trimmed.rstrip()
    return text, char_map[lead:lead + len(text)]
def process_docx(file_stream: io.BytesIO):
    """Extract the text of a DOCX file with a per-character location map.

    The text of every run of every paragraph is concatenated, and a newline
    is appended after each paragraph.

    Args:
        file_stream: In-memory DOCX data, passed to ``Document``.

    Returns:
        A ``(text, char_map)`` tuple. ``text`` is the concatenated document
        text with surrounding whitespace stripped. ``char_map`` has exactly
        one entry per character of ``text``: a dict
        ``{"p_idx": paragraph_index, "r_idx": run_index}`` for characters
        that come from a run (all characters of one run share the same dict
        object), or ``None`` for the paragraph-separating newlines.
    """
    doc = Document(file_stream)
    pieces = []
    char_map = []
    for p_idx, para in enumerate(doc.paragraphs):
        for r_idx, run in enumerate(para.runs):
            run_text = run.text
            location_info = {"p_idx": p_idx, "r_idx": r_idx}
            char_map.extend([location_info] * len(run_text))
            pieces.append(run_text)
        # Paragraph separator: present in the text, but it has no run.
        char_map.append(None)
        pieces.append("\n")

    raw_text = "".join(pieces)
    # A document that starts with empty paragraphs or whitespace-only runs
    # produces leading whitespace that strip() removes from the text. The
    # same number of leading entries must be removed from char_map, or every
    # offset computed on the returned text would point at the wrong run.
    left_trimmed = raw_text.lstrip()
    lead = len(raw_text) - len(left_trimmed)
    text = left_trimmed.rstrip()
    return text, char_map[lead:lead + len(text)]
def map_ner_results(ner_results, char_map, Entity):
    """Attach document locations to NER hits using a character map.

    Args:
        ner_results: Iterable of mappings with ``'start'`` and ``'end'``
            character offsets; ``'word'`` and ``'entity_group'`` are read
            for every hit that resolves to at least one location.
        char_map: Sequence indexed by character offset whose items are
            location dicts or falsy placeholders (e.g. ``None``).
        Entity: Callable invoked as
            ``Entity(text=..., label=..., location=[...])``.

    Returns:
        A list with one ``Entity(...)`` result per hit that covers at least
        one mapped character, in input order. Each ``location`` list holds
        the distinct location dicts (compared by content) in order of first
        appearance. Hits that touch no mapped character are omitted.
    """
    map_size = len(char_map)
    mapped = []
    for hit in ner_results:
        first_offset, stop_offset = hit['start'], hit['end']
        seen = {}
        for offset in range(first_offset, stop_offset):
            if offset >= map_size:
                # Offsets only grow, so nothing further can be in range.
                break
            place = char_map[offset]
            if not place:
                continue
            # Key on the dict's content so equal locations collapse to the
            # first one encountered.
            seen.setdefault(tuple(sorted(place.items())), place)
        if not seen:
            continue
        mapped.append(
            Entity(
                text=hit['word'],
                label=hit['entity_group'],
                location=list(seen.values()),
            )
        )
    return mapped