Spaces:
Runtime error
Runtime error
| import textract | |
| from datasets import Dataset as hfd | |
| from sentence_transformers import SentenceTransformer | |
| from config import FEATURE_EXTRACTOR_CHECKPOINT | |
# Module-level SentenceTransformer instance, built once at import time from
# the checkpoint named by FEATURE_EXTRACTOR_CHECKPOINT (imported from config).
# encode_sentence() uses it to compute embeddings.
FEATURE_EXTRACTOR = SentenceTransformer(FEATURE_EXTRACTOR_CHECKPOINT)
def encode_sentence(instance: hfd, text_col: str):
    """Compute the embedding of the text stored under ``text_col``.

    The value ``instance[text_col]`` is passed to the module-level
    ``FEATURE_EXTRACTOR`` with ``normalize_embeddings=True``.

    Args:
        instance: Object indexable by ``text_col``.
            NOTE(review): presumably a dataset row or batch supplied by
            ``Dataset.map`` -- confirm against the caller.
        text_col: Name of the column/key that holds the text to embed.

    Returns:
        A dict with a single key, ``"embedding"``, holding the encoder output.
    """
    text = instance[text_col]
    vector = FEATURE_EXTRACTOR.encode(text, normalize_embeddings=True)
    return {"embedding": vector}
def parse_pdf(pdf_path: str):
    """Extract the text of a PDF file using textract (pdfminer backend).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as a ``str``.
    """
    # textract.process returns *bytes* encoded with the requested codec, so
    # the very same codec has to be used to turn them back into ``str``.
    # Decoding latin-1 bytes with the default UTF-8 codec fails (or yields
    # wrong characters) as soon as the document contains accented letters.
    codec = "latin-1"
    raw = textract.process(pdf_path, method="pdfminer", encoding=codec)
    return raw.decode(codec)
def chunk_text(text: str, split_sentence="ARTÍCULO"):
    """Split ``text`` into chunks, one per occurrence of ``split_sentence``.

    Newlines inside each piece are replaced by spaces and surrounding
    whitespace is stripped. Every piece that followed a marker is returned
    with the marker put back in front of it (``"<marker> <piece>"``).

    Text that appears *before* the first marker was never preceded by the
    marker, so it is returned as its own chunk without the prefix. Pieces
    that are empty after stripping are dropped, so no marker-only chunk is
    produced (e.g. when ``text`` starts with the marker).

    Args:
        text: Full text to split.
        split_sentence: Marker that starts each chunk.

    Returns:
        A list of ``{"chunk": <str>}`` dicts in document order.

    Raises:
        ValueError: If ``split_sentence`` is empty (raised by ``str.split``).
    """
    preamble, *sections = text.split(split_sentence)

    chunks = []

    # Leading text belongs to no marker: keep it, but do not label it.
    preamble = preamble.replace("\n", " ").strip()
    if preamble:
        chunks.append({"chunk": preamble})

    for section in sections:
        body = section.replace("\n", " ").strip()
        if body:
            chunks.append({"chunk": split_sentence + " " + body})

    return chunks
def create_df(text_chunks: list[dict[str, str]]):
    """Create a HuggingFace dataset from a list of ``str -> str`` dicts.

    Args:
        text_chunks: One dict per row.

    Returns:
        The dataset produced by ``Dataset.from_list(text_chunks)``.
    """
    return hfd.from_list(text_chunks)