Spaces:
Runtime error
Runtime error
Commit
·
661a3cb
1
Parent(s):
63d3882
fix : improve uploader
Browse files
- api/function.py +10 -9
- api/router/book.py +2 -2
- script/document_uploader.py +2 -2
- service/aws_loader.py +2 -2
- service/reader_v3.py +10 -9
api/function.py
CHANGED
|
@@ -29,21 +29,22 @@ async def data_ingestion(reference, file: UploadFile) -> Any:
|
|
| 29 |
user_id="admin_book_uploaded",
|
| 30 |
)
|
| 31 |
|
| 32 |
-
# # Upload to AWS
|
| 33 |
-
file_name = f"{reference['title']}"
|
| 34 |
-
aws_loader = Loader()
|
| 35 |
-
|
| 36 |
-
file_obj = file
|
| 37 |
-
aws_loader.upload_to_s3(file_obj, file_name)
|
| 38 |
-
|
| 39 |
uploader = Uploader(reference, file)
|
| 40 |
-
|
| 41 |
-
nodes_with_metadata = await uploader.process_documents()
|
| 42 |
|
| 43 |
# Build indexes using IndexManager
|
| 44 |
index = IndexManager()
|
| 45 |
index.build_indexes(nodes_with_metadata)
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
return json.dumps(
|
| 48 |
{"status": "success", "message": "Vector Index loaded successfully."}
|
| 49 |
)
|
|
|
|
| 29 |
user_id="admin_book_uploaded",
|
| 30 |
)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
uploader = Uploader(reference, file)
|
| 33 |
+
nodes_with_metadata, file_stream = await uploader.process_documents()
|
|
|
|
| 34 |
|
| 35 |
# Build indexes using IndexManager
|
| 36 |
index = IndexManager()
|
| 37 |
index.build_indexes(nodes_with_metadata)
|
| 38 |
|
| 39 |
+
|
| 40 |
+
# # Upload to AWS
|
| 41 |
+
file_name = f"{reference['title']}"
|
| 42 |
+
aws_loader = Loader()
|
| 43 |
+
|
| 44 |
+
# file_obj = file
|
| 45 |
+
aws_loader.upload_to_s3(file_stream, file_name)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
return json.dumps(
|
| 49 |
{"status": "success", "message": "Vector Index loaded successfully."}
|
| 50 |
)
|
api/router/book.py
CHANGED
|
@@ -84,8 +84,8 @@ async def upload_file(
|
|
| 84 |
|
| 85 |
# Create a new Metadata object
|
| 86 |
book_query = BookQuery(user)
|
| 87 |
-
book_query.add_book(db, title, author, category_id, year, publisher)
|
| 88 |
-
logging.info("Database Inserted")
|
| 89 |
|
| 90 |
return {
|
| 91 |
"filename": file.filename,
|
|
|
|
| 84 |
|
| 85 |
# Create a new Metadata object
|
| 86 |
book_query = BookQuery(user)
|
| 87 |
+
# book_query.add_book(db, title, author, category_id, year, publisher)
|
| 88 |
+
# logging.info("Database Inserted")
|
| 89 |
|
| 90 |
return {
|
| 91 |
"filename": file.filename,
|
script/document_uploader.py
CHANGED
|
@@ -58,7 +58,7 @@ class Uploader:
|
|
| 58 |
|
| 59 |
# Get metadata
|
| 60 |
# documents_with_metadata = self.metadata.apply_metadata(documents)
|
| 61 |
-
documents_with_metadata = await upload_file(self.reference, self.file)
|
| 62 |
|
| 63 |
# Get Topic
|
| 64 |
# topic_extractor = extract_topic(self.reference, self.content_table)
|
|
@@ -85,7 +85,7 @@ class Uploader:
|
|
| 85 |
try:
|
| 86 |
nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
|
| 87 |
# nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
|
| 88 |
-
return nodes_with_metadata
|
| 89 |
|
| 90 |
except Exception as e:
|
| 91 |
try:
|
|
|
|
| 58 |
|
| 59 |
# Get metadata
|
| 60 |
# documents_with_metadata = self.metadata.apply_metadata(documents)
|
| 61 |
+
documents_with_metadata, file_stream = await upload_file(self.reference, self.file)
|
| 62 |
|
| 63 |
# Get Topic
|
| 64 |
# topic_extractor = extract_topic(self.reference, self.content_table)
|
|
|
|
| 85 |
try:
|
| 86 |
nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
|
| 87 |
# nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
|
| 88 |
+
return nodes_with_metadata, file_stream
|
| 89 |
|
| 90 |
except Exception as e:
|
| 91 |
try:
|
service/aws_loader.py
CHANGED
|
@@ -19,14 +19,14 @@ class Loader:
|
|
| 19 |
region_name="us-west-2",
|
| 20 |
)
|
| 21 |
|
| 22 |
-
def upload_to_s3(self,
|
| 23 |
try:
|
| 24 |
# If folder_name is provided, prepend it to the object_name
|
| 25 |
if folder_name:
|
| 26 |
object_name = f"{folder_name}/{object_name}"
|
| 27 |
|
| 28 |
# Open the PDF with PyMuPDF (fitz)
|
| 29 |
-
pdf_document = fitz.open(stream=
|
| 30 |
print("Jumlah halaman : ", pdf_document.page_count)
|
| 31 |
# Loop through each page of the PDF
|
| 32 |
for page_num in range(pdf_document.page_count):
|
|
|
|
| 19 |
region_name="us-west-2",
|
| 20 |
)
|
| 21 |
|
| 22 |
+
def upload_to_s3(self, file_stream: BytesIO, object_name, folder_name="summarizer"):
|
| 23 |
try:
|
| 24 |
# If folder_name is provided, prepend it to the object_name
|
| 25 |
if folder_name:
|
| 26 |
object_name = f"{folder_name}/{object_name}"
|
| 27 |
|
| 28 |
# Open the PDF with PyMuPDF (fitz)
|
| 29 |
+
pdf_document = fitz.open(stream=file_stream.getvalue(), filetype="pdf")
|
| 30 |
print("Jumlah halaman : ", pdf_document.page_count)
|
| 31 |
# Loop through each page of the PDF
|
| 32 |
for page_num in range(pdf_document.page_count):
|
service/reader_v3.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import nest_asyncio
|
|
|
|
| 3 |
|
| 4 |
from llama_parse import LlamaParse
|
| 5 |
from llama_index.core.node_parser import SimpleNodeParser
|
|
@@ -65,22 +66,22 @@ async def upload_file(reference, file: UploadFile):
|
|
| 65 |
try:
|
| 66 |
# Read the binary content of the uploaded file once
|
| 67 |
content = await file.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# Parse the journal
|
| 69 |
parsed_documents = parse_journal(content, file.filename)
|
| 70 |
-
# Extract metadata
|
| 71 |
-
# metadata_dict = await extract_metadata(content)
|
| 72 |
-
# print("Metadata Dictionary : \n\n", metadata_dict)
|
| 73 |
|
|
|
|
| 74 |
metadata_gen = Metadata(reference)
|
| 75 |
documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)
|
| 76 |
-
|
| 77 |
-
# document_with_metadata =
|
| 78 |
|
| 79 |
-
print("Document with Metadata
|
| 80 |
-
print("
|
| 81 |
|
| 82 |
-
# Return
|
| 83 |
-
return documents_with_metadata
|
| 84 |
|
| 85 |
except Exception as e:
|
| 86 |
return JSONResponse(status_code=500, content=f"Error processing file: {e}")
|
|
|
|
| 1 |
import os
|
| 2 |
import nest_asyncio
|
| 3 |
+
from io import BytesIO
|
| 4 |
|
| 5 |
from llama_parse import LlamaParse
|
| 6 |
from llama_index.core.node_parser import SimpleNodeParser
|
|
|
|
| 66 |
try:
|
| 67 |
# Read the binary content of the uploaded file once
|
| 68 |
content = await file.read()
|
| 69 |
+
|
| 70 |
+
# Store the file content in a BytesIO stream for reuse later
|
| 71 |
+
file_stream = BytesIO(content)
|
| 72 |
+
|
| 73 |
# Parse the journal
|
| 74 |
parsed_documents = parse_journal(content, file.filename)
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
+
# Generate metadata
|
| 77 |
metadata_gen = Metadata(reference)
|
| 78 |
documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
print("Document with Metadata: \n\n", documents_with_metadata)
|
| 81 |
+
print("Number of documents: \n", len(documents_with_metadata))
|
| 82 |
|
| 83 |
+
# Return the parsed documents with metadata and the file stream
|
| 84 |
+
return documents_with_metadata, file_stream
|
| 85 |
|
| 86 |
except Exception as e:
|
| 87 |
return JSONResponse(status_code=500, content=f"Error processing file: {e}")
|