Spaces:
Sleeping
Sleeping
fix: replace HF bucket storage with DB storage (files stored in Neon PostgreSQL)
Browse files- .env +19 -6
- app/application/document_service.py +3 -2
- app/config.py +1 -1
- app/dependencies.py +2 -1
- app/models.py +2 -1
- app/services/db_storage_adapter.py +55 -0
.env
CHANGED
|
@@ -1,11 +1,23 @@
|
|
| 1 |
-
# Database
|
| 2 |
-
DATABASE_URL=postgresql://
|
| 3 |
|
| 4 |
-
#
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
QDRANT_PORT=6333
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
# MinIO
|
| 9 |
MINIO_ENDPOINT=localhost:9000
|
| 10 |
MINIO_ACCESS_KEY=minioadmin
|
| 11 |
MINIO_SECRET_KEY=minioadmin
|
|
@@ -23,7 +35,8 @@ GROQ_API_KEY=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
|
|
| 23 |
GROQ_MODEL=llama-3.1-70b-versatile
|
| 24 |
|
| 25 |
# Optional - defaults shown
|
| 26 |
-
EMBEDDING_MODEL=
|
|
|
|
| 27 |
CHUNK_SIZE=500
|
| 28 |
CHUNK_OVERLAP=100
|
| 29 |
TOP_K_RETRIEVAL=5
|
|
|
|
| 1 |
+
# Database (Neon)
|
| 2 |
+
DATABASE_URL=postgresql://neondb_owner:npg_UP5jVumvACX9@ep-withered-field-am1worli-pooler.c-5.us-east-1.aws.neon.tech/neondb?sslmode=require
|
| 3 |
|
| 4 |
+
# Vector Database - Choose one
|
| 5 |
+
USE_PINECONE=true
|
| 6 |
+
|
| 7 |
+
# Pinecone (ACTIVE)
|
| 8 |
+
PINECONE_API_KEY=pcsk_32VVpv_5fkVJJwqKXjZYVpqU8vhmzU4NCRRpQg8vv7FVcP6DjmC6vwd5AdupQan3GXBPuv
|
| 9 |
+
PINECONE_ENVIRONMENT=us-east-1
|
| 10 |
+
PINECONE_INDEX_NAME=ragora
|
| 11 |
+
|
| 12 |
+
# Qdrant Cloud (Alternative)
|
| 13 |
+
QDRANT_HOST=your-cluster.aws.cloud.qdrant.io
|
| 14 |
QDRANT_PORT=6333
|
| 15 |
+
QDRANT_API_KEY=your_qdrant_api_key_here
|
| 16 |
+
|
| 17 |
+
# Storage - Use DB storage (files stored in Neon PostgreSQL)
|
| 18 |
+
USE_HF_STORAGE=false
|
| 19 |
|
| 20 |
+
# MinIO (Only for local development)
|
| 21 |
MINIO_ENDPOINT=localhost:9000
|
| 22 |
MINIO_ACCESS_KEY=minioadmin
|
| 23 |
MINIO_SECRET_KEY=minioadmin
|
|
|
|
| 35 |
GROQ_MODEL=llama-3.1-70b-versatile
|
| 36 |
|
| 37 |
# Optional - defaults shown
|
| 38 |
+
EMBEDDING_MODEL=BAAI/bge-m3
|
| 39 |
+
EMBEDDING_DIMENSION=1024
|
| 40 |
CHUNK_SIZE=500
|
| 41 |
CHUNK_OVERLAP=100
|
| 42 |
TOP_K_RETRIEVAL=5
|
app/application/document_service.py
CHANGED
|
@@ -51,18 +51,19 @@ class DocumentService:
|
|
| 51 |
file.seek(0)
|
| 52 |
text = await self.processor.extract_text(file, filename)
|
| 53 |
|
| 54 |
-
# 3. Upload to storage
|
| 55 |
file.seek(0)
|
| 56 |
file_data = file.read()
|
| 57 |
storage_key = f"{user.org_id}/{uuid.uuid4()}_{filename}"
|
| 58 |
content_type = self._get_content_type(filename)
|
| 59 |
await self.storage.upload(storage_key, file_data, content_type)
|
| 60 |
|
| 61 |
-
# 4. Create document record
|
| 62 |
document = Document(
|
| 63 |
name=filename,
|
| 64 |
size=file_size,
|
| 65 |
storage_path=storage_key,
|
|
|
|
| 66 |
user_id=user.id,
|
| 67 |
org_id=user.org_id,
|
| 68 |
folder_id=folder_id
|
|
|
|
| 51 |
file.seek(0)
|
| 52 |
text = await self.processor.extract_text(file, filename)
|
| 53 |
|
| 54 |
+
# 3. Upload to storage (cache)
|
| 55 |
file.seek(0)
|
| 56 |
file_data = file.read()
|
| 57 |
storage_key = f"{user.org_id}/{uuid.uuid4()}_{filename}"
|
| 58 |
content_type = self._get_content_type(filename)
|
| 59 |
await self.storage.upload(storage_key, file_data, content_type)
|
| 60 |
|
| 61 |
+
# 4. Create document record (persist file content in DB)
|
| 62 |
document = Document(
|
| 63 |
name=filename,
|
| 64 |
size=file_size,
|
| 65 |
storage_path=storage_key,
|
| 66 |
+
file_content=file_data, # Store in Neon PostgreSQL
|
| 67 |
user_id=user.id,
|
| 68 |
org_id=user.org_id,
|
| 69 |
folder_id=folder_id
|
app/config.py
CHANGED
|
@@ -32,7 +32,7 @@ class Settings(BaseSettings):
|
|
| 32 |
MINIO_BUCKET: str = "ragora-documents"
|
| 33 |
MINIO_SECURE: bool = os.getenv("MINIO_SECURE", "false").lower() == "true"
|
| 34 |
|
| 35 |
-
#
|
| 36 |
USE_HF_STORAGE: bool = os.getenv("USE_HF_STORAGE", "false").lower() == "true"
|
| 37 |
HF_BUCKET_PATH: str = os.getenv("HF_BUCKET_PATH", "hf://buckets/Peterase/Ragora-doc-store")
|
| 38 |
|
|
|
|
| 32 |
MINIO_BUCKET: str = "ragora-documents"
|
| 33 |
MINIO_SECURE: bool = os.getenv("MINIO_SECURE", "false").lower() == "true"
|
| 34 |
|
| 35 |
+
# Storage - Use DB storage by default (no external service needed)
|
| 36 |
USE_HF_STORAGE: bool = os.getenv("USE_HF_STORAGE", "false").lower() == "true"
|
| 37 |
HF_BUCKET_PATH: str = os.getenv("HF_BUCKET_PATH", "hf://buckets/Peterase/Ragora-doc-store")
|
| 38 |
|
app/dependencies.py
CHANGED
|
@@ -9,6 +9,7 @@ from app.ports.vector_db import VectorDBPort
|
|
| 9 |
from app.ports.llm import LLMPort
|
| 10 |
from app.services.storage_adapter import MinIOStorageAdapter
|
| 11 |
from app.services.hf_storage_adapter import HFStorageAdapter
|
|
|
|
| 12 |
from app.services.document_processor_adapter import DocumentProcessorAdapter
|
| 13 |
from app.services.embedder_adapter import SentenceTransformerAdapter
|
| 14 |
from app.services.vector_db_adapter import QdrantAdapter
|
|
@@ -28,7 +29,7 @@ settings = get_settings()
|
|
| 28 |
def get_storage_port() -> StoragePort:
|
| 29 |
if settings.USE_HF_STORAGE:
|
| 30 |
return HFStorageAdapter()
|
| 31 |
-
return
|
| 32 |
|
| 33 |
|
| 34 |
@lru_cache()
|
|
|
|
| 9 |
from app.ports.llm import LLMPort
|
| 10 |
from app.services.storage_adapter import MinIOStorageAdapter
|
| 11 |
from app.services.hf_storage_adapter import HFStorageAdapter
|
| 12 |
+
from app.services.db_storage_adapter import DatabaseStorageAdapter
|
| 13 |
from app.services.document_processor_adapter import DocumentProcessorAdapter
|
| 14 |
from app.services.embedder_adapter import SentenceTransformerAdapter
|
| 15 |
from app.services.vector_db_adapter import QdrantAdapter
|
|
|
|
| 29 |
def get_storage_port() -> StoragePort:
|
| 30 |
if settings.USE_HF_STORAGE:
|
| 31 |
return HFStorageAdapter()
|
| 32 |
+
return DatabaseStorageAdapter()
|
| 33 |
|
| 34 |
|
| 35 |
@lru_cache()
|
app/models.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""Domain models - SQLAlchemy entities."""
|
| 2 |
-
from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, Float
|
| 3 |
from sqlalchemy.ext.declarative import declarative_base
|
| 4 |
from sqlalchemy.orm import relationship
|
| 5 |
from datetime import datetime
|
|
@@ -43,6 +43,7 @@ class Document(Base):
|
|
| 43 |
name = Column(String, nullable=False)
|
| 44 |
size = Column(Integer, nullable=False)
|
| 45 |
storage_path = Column(String, nullable=False)
|
|
|
|
| 46 |
chunks = Column(Integer, default=0)
|
| 47 |
folder_id = Column(String, ForeignKey("folders.id"), nullable=True)
|
| 48 |
user_id = Column(String, ForeignKey("users.id"), nullable=False)
|
|
|
|
| 1 |
"""Domain models - SQLAlchemy entities."""
|
| 2 |
+
from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, Float, LargeBinary
|
| 3 |
from sqlalchemy.ext.declarative import declarative_base
|
| 4 |
from sqlalchemy.orm import relationship
|
| 5 |
from datetime import datetime
|
|
|
|
| 43 |
name = Column(String, nullable=False)
|
| 44 |
size = Column(Integer, nullable=False)
|
| 45 |
storage_path = Column(String, nullable=False)
|
| 46 |
+
file_content = Column(LargeBinary, nullable=True) # Store file in DB
|
| 47 |
chunks = Column(Integer, default=0)
|
| 48 |
folder_id = Column(String, ForeignKey("folders.id"), nullable=True)
|
| 49 |
user_id = Column(String, ForeignKey("users.id"), nullable=False)
|
app/services/db_storage_adapter.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Database storage adapter - stores files in PostgreSQL (Neon)."""
|
| 2 |
+
from app.ports.storage import StoragePort
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
# In-memory store keyed by storage_path -> bytes
|
| 8 |
+
# This works because document_service stores the file THEN immediately reads it back
|
| 9 |
+
# For download, we fetch from the documents table directly
|
| 10 |
+
_file_cache: dict[str, bytes] = {}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class DatabaseStorageAdapter(StoragePort):
|
| 14 |
+
"""Stores file bytes in an in-memory cache during processing.
|
| 15 |
+
|
| 16 |
+
The actual file content is persisted in the Document.file_content column
|
| 17 |
+
by the document_service after upload. Downloads are served from there.
|
| 18 |
+
This adapter bridges the gap between the StoragePort interface and the DB.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
async def upload(self, key: str, data: bytes, content_type: str) -> str:
|
| 22 |
+
"""Cache file bytes under the given key."""
|
| 23 |
+
_file_cache[key] = data
|
| 24 |
+
logger.info(f"Cached file for key: {key} ({len(data)} bytes)")
|
| 25 |
+
return key
|
| 26 |
+
|
| 27 |
+
async def download(self, key: str) -> bytes:
|
| 28 |
+
"""Return cached bytes, or look up from DB if not cached."""
|
| 29 |
+
if key in _file_cache:
|
| 30 |
+
return _file_cache[key]
|
| 31 |
+
|
| 32 |
+
# Fall back to DB lookup
|
| 33 |
+
try:
|
| 34 |
+
from app.database import SessionLocal
|
| 35 |
+
from app.models import Document
|
| 36 |
+
db = SessionLocal()
|
| 37 |
+
try:
|
| 38 |
+
doc = db.query(Document).filter(Document.storage_path == key).first()
|
| 39 |
+
if doc and doc.file_content:
|
| 40 |
+
return doc.file_content
|
| 41 |
+
finally:
|
| 42 |
+
db.close()
|
| 43 |
+
except Exception as e:
|
| 44 |
+
logger.error(f"DB fallback download failed for {key}: {e}")
|
| 45 |
+
|
| 46 |
+
raise FileNotFoundError(f"File not found: {key}")
|
| 47 |
+
|
| 48 |
+
async def delete(self, key: str) -> None:
|
| 49 |
+
"""Remove from cache."""
|
| 50 |
+
_file_cache.pop(key, None)
|
| 51 |
+
logger.info(f"Removed cached file: {key}")
|
| 52 |
+
|
| 53 |
+
async def get_presigned_url(self, key: str, expires: int = 3600) -> str:
|
| 54 |
+
"""Return a relative download URL handled by the API."""
|
| 55 |
+
return f"/documents/download/{key}"
|