Spaces:
Runtime error
Runtime error
| import os | |
| import nest_asyncio | |
| from llama_parse import LlamaParse | |
| from llama_index.core.node_parser import SimpleNodeParser | |
| from dotenv import load_dotenv | |
| from fastapi import UploadFile, HTTPException, File | |
| import fitz | |
| from script.get_metadata import Metadata | |
# NOTE(review): presumably loads variables from a local .env file so that
# os.getenv("LLAMA_PARSE_API_KEY") in parse_journal can find the key - confirm.
| load_dotenv() | |
# NOTE(review): presumably patches asyncio so an event loop can be re-entered;
# parse_journal calls the synchronous parser.load_data from inside a coroutine,
# which looks like the reason this is here - confirm it is still required.
| nest_asyncio.apply() | |
async def parse_journal(content: bytes, file_name: str):
    """Parse an uploaded journal into markdown documents using LlamaParse.

    Args:
        content: Raw bytes of the uploaded file.
        file_name: Name of the uploaded file; forwarded to LlamaParse as
            ``extra_info={"file_name": ...}``.

    Returns:
        The documents produced by LlamaParse for ``content``.

    Raises:
        HTTPException: With status 400 if the parser cannot be created or
            parsing fails for any reason.
    """
    try:
        # The API key is read from the environment at call time.
        parser = LlamaParse(
            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
            result_type="markdown",
            max_timeout=5000,
        )
        # Await the async entry point instead of the blocking ``load_data``
        # wrapper: calling the sync wrapper inside a coroutine stalls the
        # event loop and only works when the loop has been patched to allow
        # re-entrancy.
        return await parser.aload_data(
            content, extra_info={"file_name": file_name}
        )
    except Exception as e:
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(
            status_code=400, detail=f"Error processing file: {e}"
        ) from e
async def extract_metadata(content: bytes):
    """Extract basic document metadata from in-memory PDF content.

    Args:
        content: Raw bytes of a PDF file.

    Returns:
        A dict with the keys ``title``, ``author``, ``subject``, ``keywords``,
        ``creation_date`` and ``modification_date``. Any field that is absent
        or empty in the PDF is reported as ``"N/A"``.

    Raises:
        HTTPException: With status 500 if the content cannot be opened or
            read as a PDF.
    """
    # Output key -> key used by PyMuPDF's ``Document.metadata``. The date
    # fields are named ``creationDate`` / ``modDate`` there.
    field_map = (
        ("title", "title"),
        ("author", "author"),
        ("subject", "subject"),
        ("keywords", "keywords"),
        ("creation_date", "creationDate"),
        ("modification_date", "modDate"),
    )
    try:
        # Open from memory; the context manager closes the document even if
        # reading the metadata raises.
        with fitz.open(stream=content, filetype="pdf") as pdf_document:
            # ``metadata`` may be None (e.g. for an encrypted document).
            metadata = pdf_document.metadata or {}
            # ``or "N/A"`` also covers keys that exist with an empty/None
            # value, which a plain ``.get(key, default)`` would let through.
            return {
                out_key: metadata.get(pdf_key) or "N/A"
                for out_key, pdf_key in field_map
            }
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error inputting metadata: {e}"
        ) from e
async def upload_file(file: UploadFile = File(...)):
    """Parse an uploaded journal and attach its PDF metadata to the result.

    The upload is read once; the same bytes are passed to ``parse_journal``
    and ``extract_metadata``. The extracted metadata is then attached to the
    parsed documents via ``Metadata.add_metadata``. Intermediate results are
    printed to stdout.

    Args:
        file: The uploaded file.

    Returns:
        ``{"status": "SUCCESS"}`` when every step completes.

    Raises:
        HTTPException: Errors raised by the helpers are propagated with their
            own status code; any other failure is reported as status 500.
    """
    try:
        # Read the binary content of the uploaded file once.
        content = await file.read()

        parsed_documents = await parse_journal(content, file.filename)

        metadata_dict = await extract_metadata(content)
        print("Metadata Dictionary : \n\n", metadata_dict)

        metadata_gen = Metadata(metadata_dict)
        documents_with_metadata = metadata_gen.add_metadata(
            parsed_documents, metadata_dict
        )
        print("Document with Metadata : \n\n", documents_with_metadata)
        print("Banyak documents : \n", len(documents_with_metadata))

        # Only a status flag is returned; the documents are not sent back.
        return {"status": "SUCCESS"}
    except HTTPException:
        # Keep the status code chosen by the helper (e.g. 400 from
        # parse_journal) instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error processing file: {e}"
        ) from e