File size: 1,250 Bytes
26cf0a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import tempfile
import nest_asyncio
from llama_parse import LlamaParse
from llama_index.core.node_parser import SimpleNodeParser
from dotenv import load_dotenv

def pdf_parser(pdf_file):
    """Parse a PDF with LlamaParse and return its text split into chunks.

    The bytes read from *pdf_file* are spooled to a temporary ``.pdf`` file
    on disk, that path is handed to ``LlamaParse.load_data``, and the
    resulting documents are split with ``SimpleNodeParser`` (chunk size 500,
    overlap 20). The temporary file is removed before returning, including
    when parsing fails.

    Args:
        pdf_file: A binary file-like object; only its ``read()`` method is
            used. The caller remains responsible for closing it.

    Returns:
        A list of dicts, one per chunk, each with:
            ``'text'``: the chunk content (``node.get_content()``).
            ``'page'``: the node's ``page_label`` metadata, or ``'N/A'``
            when that key is absent.

    Raises:
        ValueError: If ``LLAMA_CLOUD_API_KEY`` is unset or empty after
            loading the ``.env`` file.
    """
    load_dotenv()
    api_key = os.getenv("LLAMA_CLOUD_API_KEY")
    if not api_key:
        raise ValueError("LLAMA_CLOUD_API_KEY is not set in the environment variables.")

    # delete=False because the file must outlive this handle: it is closed
    # first and then re-opened by path by the parser. Cleanup is therefore
    # manual, and done in ``finally`` so a failed parse does not leak it.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_file_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(pdf_file.read())

        # NOTE(review): presumably needed so load_data can run when an event
        # loop is already active (e.g. notebooks / async servers) - confirm.
        nest_asyncio.apply()

        parser = LlamaParse(api_key=api_key, result_type="text", verbose=True)
        documents = parser.load_data(temp_file_path)

        node_parser = SimpleNodeParser.from_defaults(chunk_size=500, chunk_overlap=20)
        nodes = node_parser.get_nodes_from_documents(documents)

        pdf_content = [
            {
                'text': node.get_content(),
                'page': node.metadata.get('page_label', 'N/A'),
            }
            for node in nodes
        ]
    finally:
        os.unlink(temp_file_path)

    print(f"Parsed {len(pdf_content)} chunks from the PDF.")
    print(f"{pdf_content}")
    return pdf_content


if __name__ == "__main__":
    # Manual smoke run against a sample report in the working directory.
    # The context manager closes the file handle even if parsing raises.
    with open("FP-Juliett-Final-Report.pdf", "rb") as sample_pdf:
        pdf_parser(sample_pdf)