Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Paused

App Files Files Community

sofhiaazzhr committed 28 days ago

Commit

770f26b

1 Parent(s): 29efec6

[KM-513][document] add conversion to Parquet when the file type is XLSX or CSV

Browse files

Files changed (4) hide show

src/knowledge/parquet_service.py +77 -0
src/knowledge/processing_service.py +15 -7
src/pipeline/document_pipeline/document_pipeline.py +4 -0
src/storage/az_blob/az_blob.py +34 -0

src/knowledge/parquet_service.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""Parquet service — converts, uploads, downloads, and deletes Parquet files for CSV/XLSX.
+Parquet files are stored in Azure Blob alongside the original document using
+a deterministic naming convention based on document_id:
+  CSV:        {user_id}/{document_id}.parquet
+  XLSX sheet: {user_id}/{document_id}__{safe_sheet_name}.parquet
+This allows tabular.py to construct the correct blob name at retrieval time
+without needing to store it separately, and allows document_pipeline.py to
+delete all Parquet files for a document using a prefix delete.
+"""
+import io
+import pandas as pd
+from src.middlewares.logging import get_logger
+from src.storage.az_blob.az_blob import blob_storage
+logger = get_logger("parquet_service")
+def _safe_sheet_name(sheet_name: str) -> str:
+    return sheet_name.replace("/", "_").replace(" ", "_").replace("\\", "_")
+def parquet_blob_name(user_id: str, document_id: str, sheet_name: str | None = None) -> str:
+    """Construct deterministic Parquet blob name."""
+    if sheet_name:
+        return f"{user_id}/{document_id}__{_safe_sheet_name(sheet_name)}.parquet"
+    return f"{user_id}/{document_id}.parquet"
+def _to_parquet_bytes(df: pd.DataFrame) -> bytes:
+    buf = io.BytesIO()
+    df.to_parquet(buf, index=False)
+    return buf.getvalue()
+async def upload_parquet(
+    df: pd.DataFrame,
+    user_id: str,
+    document_id: str,
+    sheet_name: str | None = None,
+) -> str:
+    """Convert DataFrame to Parquet and upload to Azure Blob. Returns blob_name."""
+    blob_name = parquet_blob_name(user_id, document_id, sheet_name)
+    parquet_bytes = _to_parquet_bytes(df)
+    await blob_storage.upload_bytes(parquet_bytes, blob_name)
+    logger.info(f"Uploaded Parquet {blob_name} ({len(parquet_bytes)} bytes)")
+    return blob_name
+async def download_parquet(
+    user_id: str,
+    document_id: str,
+    sheet_name: str | None = None,
+) -> pd.DataFrame:
+    """Download Parquet from Azure Blob and return as DataFrame."""
+    blob_name = parquet_blob_name(user_id, document_id, sheet_name)
+    content = await blob_storage.download_file(blob_name)
+    df = pd.read_parquet(io.BytesIO(content))
+    logger.info(f"Downloaded Parquet {blob_name}: {len(df)} rows, {len(df.columns)} columns")
+    return df
+async def delete_document_parquets(user_id: str, document_id: str) -> int:
+    """Delete all Parquet files for a document (CSV = 1 file, XLSX = one per sheet).
+    Uses prefix delete: {user_id}/{document_id} matches all Parquet variants
+    for this document without touching the original blob (which uses a random UUID name).
+    """
+    prefix = f"{user_id}/{document_id}"
+    deleted = await blob_storage.delete_blobs_with_prefix(prefix)
+    logger.info(f"Deleted {deleted} Parquet file(s) for document {document_id}")
+    return deleted

src/knowledge/processing_service.py CHANGED Viewed

@@ -7,6 +7,7 @@ from src.storage.az_blob.az_blob import blob_storage
 from src.db.postgres.models import Document as DBDocument
 from sqlalchemy.ext.asyncio import AsyncSession
 from src.middlewares.logging import get_logger
 from typing import List
 from datetime import datetime, timezone, timedelta
 import sys
@@ -44,9 +45,9 @@ class KnowledgeProcessingService:
             if db_doc.file_type == "pdf":
                 documents = await self._build_pdf_documents(content, db_doc)
             elif db_doc.file_type == "csv":
-                documents = self._build_csv_documents(content, db_doc)
             elif db_doc.file_type == "xlsx":
-                documents = self._build_excel_documents(content, db_doc)
             else:
                 text = self._extract_text(content, db_doc.file_type)
                 if not text.strip():
@@ -168,18 +169,25 @@ class KnowledgeProcessingService:
             ))
         return documents
-    def _build_csv_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
-        """Profile each column of a CSV file."""
         df = pd.read_csv(BytesIO(content))
         return self._profile_dataframe(df, db_doc.filename, db_doc)
-    def _build_excel_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
-        """Profile each column of every sheet in an Excel file."""
         sheets = pd.read_excel(BytesIO(content), sheet_name=None)
         documents = []
         for sheet_name, df in sheets.items():
             source_name = f"{db_doc.filename} / sheet: {sheet_name}"
-            documents.extend(self._profile_dataframe(df, source_name, db_doc))
         return documents
     def _extract_text(self, content: bytes, file_type: str) -> str:

 from src.db.postgres.models import Document as DBDocument
 from sqlalchemy.ext.asyncio import AsyncSession
 from src.middlewares.logging import get_logger
+from src.knowledge.parquet_service import upload_parquet
 from typing import List
 from datetime import datetime, timezone, timedelta
 import sys
             if db_doc.file_type == "pdf":
                 documents = await self._build_pdf_documents(content, db_doc)
             elif db_doc.file_type == "csv":
+                documents = await self._build_csv_documents(content, db_doc)
             elif db_doc.file_type == "xlsx":
+                documents = await self._build_excel_documents(content, db_doc)
             else:
                 text = self._extract_text(content, db_doc.file_type)
                 if not text.strip():
             ))
         return documents
+    async def _build_csv_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
+        """Profile each column of a CSV file and upload Parquet to Azure Blob.
+        The CSV bytes are parsed into a single DataFrame, which is uploaded as one
+        Parquet blob (no sheet name) and then profiled via _profile_dataframe.
+        """
         df = pd.read_csv(BytesIO(content))
+        # Upload runs before profiling and is not wrapped in try/except here,
+        # so an upload error propagates and no documents are returned.
+        await upload_parquet(df, db_doc.user_id, db_doc.id)
+        logger.info(f"Uploaded Parquet for CSV {db_doc.id}")
         return self._profile_dataframe(df, db_doc.filename, db_doc)
+    async def _build_excel_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
+        """Profile each column of every sheet in an Excel file and upload one Parquet per sheet.
+        Every profiled document is tagged with the name of the sheet it came from,
+        presumably so the matching per-sheet Parquet blob can be located later
+        (verify against the retrieval code).
+        """
+        # sheet_name=None makes read_excel return all sheets, iterated below as (name, DataFrame) pairs.
         sheets = pd.read_excel(BytesIO(content), sheet_name=None)
         documents = []
         for sheet_name, df in sheets.items():
             source_name = f"{db_doc.filename} / sheet: {sheet_name}"
+            docs = self._profile_dataframe(df, source_name, db_doc)
+            # Stores the raw (unsanitized) sheet name.
+            # Assumes each profiled doc carries a metadata["data"] dict — TODO confirm.
+            for doc in docs:
+                doc.metadata["data"]["sheet_name"] = sheet_name
+            documents.extend(docs)
+            # One Parquet blob per sheet; an upload error propagates and aborts the remaining sheets.
+            await upload_parquet(df, db_doc.user_id, db_doc.id, sheet_name)
+            logger.info(f"Uploaded Parquet for sheet '{sheet_name}' of {db_doc.id}")
         return documents
     def _extract_text(self, content: bytes, file_type: str) -> str:

src/pipeline/document_pipeline/document_pipeline.py CHANGED Viewed

@@ -5,6 +5,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from src.document.document_service import document_service
 from src.knowledge.processing_service import knowledge_processor
 from src.middlewares.logging import get_logger
 from src.storage.az_blob.az_blob import blob_storage
@@ -81,6 +82,9 @@ class DocumentPipeline:
         await document_service.delete_document(db, document_id)
         logger.info(f"Deleted document {document_id} for user {user_id}")
         return {"document_id": document_id}

 from src.document.document_service import document_service
 from src.knowledge.processing_service import knowledge_processor
+from src.knowledge.parquet_service import delete_document_parquets
 from src.middlewares.logging import get_logger
 from src.storage.az_blob.az_blob import blob_storage
         await document_service.delete_document(db, document_id)
+        if document.file_type in ("csv", "xlsx"):
+            await delete_document_parquets(user_id, document_id)
         logger.info(f"Deleted document {document_id} for user {user_id}")
         return {"document_id": document_id}

src/storage/az_blob/az_blob.py CHANGED Viewed

@@ -57,6 +57,22 @@ class AzureBlobStorage:
             logger.error(f"Failed to download blob {blob_name}", error=str(e))
             raise
     async def delete_file(self, blob_name: str) -> bool:
         """Delete file from Azure Blob Storage."""
         try:
@@ -71,6 +87,24 @@ class AzureBlobStorage:
             logger.error(f"Failed to delete blob {blob_name}", error=str(e))
             return False
 # Singleton instance
 blob_storage = AzureBlobStorage()

             logger.error(f"Failed to download blob {blob_name}", error=str(e))
             raise
+    async def upload_bytes(self, content: bytes, blob_name: str) -> str:
+        """Upload bytes to Azure Blob Storage using a specific blob name.
+        Unlike upload_file(), this does not generate a UUID name — caller controls the blob_name.
+        Used for Parquet files where the name must be deterministic (derived from document_id).
+        """
+        try:
+            async with self._get_blob_client(blob_name) as blob_client:
+                logger.info(f"Uploading bytes to blob {blob_name}")
+                await blob_client.upload_blob(content, overwrite=True)
+            logger.info(f"Successfully uploaded {blob_name}")
+            return blob_name
+        except Exception as e:
+            logger.error(f"Failed to upload bytes to {blob_name}", error=str(e))
+            raise
     async def delete_file(self, blob_name: str) -> bool:
         """Delete file from Azure Blob Storage."""
         try:
             logger.error(f"Failed to delete blob {blob_name}", error=str(e))
             return False
+    async def delete_blobs_with_prefix(self, prefix: str) -> int:
+        """Delete all blobs whose name starts with prefix. Returns count deleted.
+        Used to delete all Parquet files for a document in one call.
+        """
+        from azure.storage.blob.aio import ContainerClient
+        container_url = f"{self.account_url}/{self.container_name}?{self.sas_token}"
+        deleted = 0
+        try:
+            async with ContainerClient.from_container_url(container_url) as container:
+                async for blob in container.list_blobs(name_starts_with=prefix):
+                    await container.delete_blob(blob.name)
+                    deleted += 1
+            logger.info(f"Deleted {deleted} blobs with prefix {prefix}")
+        except Exception as e:
+            logger.error(f"Failed to delete blobs with prefix {prefix}", error=str(e))
+        return deleted
 # Singleton instance
 blob_storage = AzureBlobStorage()