# NOTE(review): removed scraped web-page header artifacts ("Spaces:" /
# "Sleeping") that were captured along with this source file.
# Standard library
import re

# Third-party
import pandas as pd
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse

# Load environment variables (e.g. the LlamaParse API key) from a .env file
# before any parser is constructed.
load_dotenv()

# Paragraphs shorter than this many characters are discarded as noise.
MIN_PARAGRAPH_LENGTH = 50
def extract_paragraphs(markdown_text, min_length=None):
    """Split markdown text into cleaned paragraph chunks.

    Paragraphs are delimited by one or more blank lines. Whitespace-only
    paragraphs, paragraphs shorter than ``min_length`` characters, and
    markdown headings (paragraphs starting with ``#``) are dropped.

    Args:
        markdown_text: Raw markdown content to split.
        min_length: Minimum paragraph length in characters. Defaults to the
            module-level ``MIN_PARAGRAPH_LENGTH`` when not given, preserving
            the original behavior.

    Returns:
        List of paragraph strings that passed all filters.
    """
    # Resolve the default lazily so callers may override per call.
    if min_length is None:
        min_length = MIN_PARAGRAPH_LENGTH

    # One or more blank lines marks a paragraph boundary.
    raw_paragraphs = re.split(r"\n\n+", markdown_text)
    # Trim surrounding whitespace and drop empty entries.
    stripped = [p.strip() for p in raw_paragraphs if p.strip()]
    # Keep only substantive body text: long enough and not a heading.
    paragraphs = [
        p
        for p in stripped
        if len(p) >= min_length and not p.startswith("#")
    ]
    print(f"created {len(paragraphs)} paragraphs\n", paragraphs)
    return paragraphs
def extract_endpoint_llama(file_paths):
    """Parse PDF files with LlamaParse and chunk each into paragraphs.

    Args:
        file_paths: Paths of the PDF files to parse.

    Returns:
        A two-element list: the raw per-document records
        (``[{"paper": <file name>, "chunks": [<paragraph>, ...]}, ...]``)
        and the same records as a pandas DataFrame.
    """
    # "markdown" keeps the blank-line paragraph structure that
    # extract_paragraphs relies on ("text" is also available).
    parser = LlamaParse(result_type="markdown")
    # Route .pdf files through LlamaParse; SimpleDirectoryReader handles
    # the rest of the loading machinery.
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        input_files=file_paths, file_extractor=file_extractor
    ).load_data()

    extracted_data = []
    for doc in documents:
        # Debug preview of the parsed text.
        print(doc.text[:500])
        paragraphs = extract_paragraphs(doc.text)
        extracted_data.append(
            {
                "paper": doc.metadata["file_name"],
                "chunks": paragraphs,
            }
        )

    df = pd.DataFrame(extracted_data)
    return [extracted_data, df]