Peterase commited on
Commit
73970da
·
1 Parent(s): 70dc6c7

fix: replace HF bucket storage with DB storage (files stored in Neon PostgreSQL)

Browse files
.env CHANGED
@@ -1,11 +1,23 @@
1
- # Database
2
- DATABASE_URL=postgresql://postgres:postgres@localhost:5432/ragora
3
 
4
- # Qdrant
5
- QDRANT_HOST=localhost
 
 
 
 
 
 
 
 
6
  QDRANT_PORT=6333
 
 
 
 
7
 
8
- # MinIO
9
  MINIO_ENDPOINT=localhost:9000
10
  MINIO_ACCESS_KEY=minioadmin
11
  MINIO_SECRET_KEY=minioadmin
@@ -23,7 +35,8 @@ GROQ_API_KEY=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
23
  GROQ_MODEL=llama-3.1-70b-versatile
24
 
25
  # Optional - defaults shown
26
- EMBEDDING_MODEL=all-MiniLM-L6-v2
 
27
  CHUNK_SIZE=500
28
  CHUNK_OVERLAP=100
29
  TOP_K_RETRIEVAL=5
 
1
+ # Database (Neon)
2
+ DATABASE_URL=postgresql://neondb_owner:npg_UP5jVumvACX9@ep-withered-field-am1worli-pooler.c-5.us-east-1.aws.neon.tech/neondb?sslmode=require
3
 
4
+ # Vector Database - Choose one
5
+ USE_PINECONE=true
6
+
7
+ # Pinecone (ACTIVE)
8
+ PINECONE_API_KEY=pcsk_32VVpv_5fkVJJwqKXjZYVpqU8vhmzU4NCRRpQg8vv7FVcP6DjmC6vwd5AdupQan3GXBPuv
9
+ PINECONE_ENVIRONMENT=us-east-1
10
+ PINECONE_INDEX_NAME=ragora
11
+
12
+ # Qdrant Cloud (Alternative)
13
+ QDRANT_HOST=your-cluster.aws.cloud.qdrant.io
14
  QDRANT_PORT=6333
15
+ QDRANT_API_KEY=your_qdrant_api_key_here
16
+
17
+ # Storage - Use DB storage (files stored in Neon PostgreSQL)
18
+ USE_HF_STORAGE=false
19
 
20
+ # MinIO (Only for local development)
21
  MINIO_ENDPOINT=localhost:9000
22
  MINIO_ACCESS_KEY=minioadmin
23
  MINIO_SECRET_KEY=minioadmin
 
35
  GROQ_MODEL=llama-3.1-70b-versatile
36
 
37
  # Optional - defaults shown
38
+ EMBEDDING_MODEL=BAAI/bge-m3
39
+ EMBEDDING_DIMENSION=1024
40
  CHUNK_SIZE=500
41
  CHUNK_OVERLAP=100
42
  TOP_K_RETRIEVAL=5
app/application/document_service.py CHANGED
@@ -51,18 +51,19 @@ class DocumentService:
51
  file.seek(0)
52
  text = await self.processor.extract_text(file, filename)
53
 
54
- # 3. Upload to storage
55
  file.seek(0)
56
  file_data = file.read()
57
  storage_key = f"{user.org_id}/{uuid.uuid4()}_{filename}"
58
  content_type = self._get_content_type(filename)
59
  await self.storage.upload(storage_key, file_data, content_type)
60
 
61
- # 4. Create document record
62
  document = Document(
63
  name=filename,
64
  size=file_size,
65
  storage_path=storage_key,
 
66
  user_id=user.id,
67
  org_id=user.org_id,
68
  folder_id=folder_id
 
51
  file.seek(0)
52
  text = await self.processor.extract_text(file, filename)
53
 
54
+ # 3. Upload to storage (cache)
55
  file.seek(0)
56
  file_data = file.read()
57
  storage_key = f"{user.org_id}/{uuid.uuid4()}_{filename}"
58
  content_type = self._get_content_type(filename)
59
  await self.storage.upload(storage_key, file_data, content_type)
60
 
61
+ # 4. Create document record (persist file content in DB)
62
  document = Document(
63
  name=filename,
64
  size=file_size,
65
  storage_path=storage_key,
66
+ file_content=file_data, # Store in Neon PostgreSQL
67
  user_id=user.id,
68
  org_id=user.org_id,
69
  folder_id=folder_id
app/config.py CHANGED
@@ -32,7 +32,7 @@ class Settings(BaseSettings):
32
  MINIO_BUCKET: str = "ragora-documents"
33
  MINIO_SECURE: bool = os.getenv("MINIO_SECURE", "false").lower() == "true"
34
 
35
- # Hugging Face Storage
36
  USE_HF_STORAGE: bool = os.getenv("USE_HF_STORAGE", "false").lower() == "true"
37
  HF_BUCKET_PATH: str = os.getenv("HF_BUCKET_PATH", "hf://buckets/Peterase/Ragora-doc-store")
38
 
 
32
  MINIO_BUCKET: str = "ragora-documents"
33
  MINIO_SECURE: bool = os.getenv("MINIO_SECURE", "false").lower() == "true"
34
 
35
+ # Storage - Use DB storage by default (no external service needed)
36
  USE_HF_STORAGE: bool = os.getenv("USE_HF_STORAGE", "false").lower() == "true"
37
  HF_BUCKET_PATH: str = os.getenv("HF_BUCKET_PATH", "hf://buckets/Peterase/Ragora-doc-store")
38
 
app/dependencies.py CHANGED
@@ -9,6 +9,7 @@ from app.ports.vector_db import VectorDBPort
9
  from app.ports.llm import LLMPort
10
  from app.services.storage_adapter import MinIOStorageAdapter
11
  from app.services.hf_storage_adapter import HFStorageAdapter
 
12
  from app.services.document_processor_adapter import DocumentProcessorAdapter
13
  from app.services.embedder_adapter import SentenceTransformerAdapter
14
  from app.services.vector_db_adapter import QdrantAdapter
@@ -28,7 +29,7 @@ settings = get_settings()
28
  def get_storage_port() -> StoragePort:
29
  if settings.USE_HF_STORAGE:
30
  return HFStorageAdapter()
31
- return MinIOStorageAdapter()
32
 
33
 
34
  @lru_cache()
 
9
  from app.ports.llm import LLMPort
10
  from app.services.storage_adapter import MinIOStorageAdapter
11
  from app.services.hf_storage_adapter import HFStorageAdapter
12
+ from app.services.db_storage_adapter import DatabaseStorageAdapter
13
  from app.services.document_processor_adapter import DocumentProcessorAdapter
14
  from app.services.embedder_adapter import SentenceTransformerAdapter
15
  from app.services.vector_db_adapter import QdrantAdapter
 
29
  def get_storage_port() -> StoragePort:
30
  if settings.USE_HF_STORAGE:
31
  return HFStorageAdapter()
32
+ return DatabaseStorageAdapter()
33
 
34
 
35
  @lru_cache()
app/models.py CHANGED
@@ -1,5 +1,5 @@
1
  """Domain models - SQLAlchemy entities."""
2
- from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, Float
3
  from sqlalchemy.ext.declarative import declarative_base
4
  from sqlalchemy.orm import relationship
5
  from datetime import datetime
@@ -43,6 +43,7 @@ class Document(Base):
43
  name = Column(String, nullable=False)
44
  size = Column(Integer, nullable=False)
45
  storage_path = Column(String, nullable=False)
 
46
  chunks = Column(Integer, default=0)
47
  folder_id = Column(String, ForeignKey("folders.id"), nullable=True)
48
  user_id = Column(String, ForeignKey("users.id"), nullable=False)
 
1
  """Domain models - SQLAlchemy entities."""
2
+ from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, Float, LargeBinary
3
  from sqlalchemy.ext.declarative import declarative_base
4
  from sqlalchemy.orm import relationship
5
  from datetime import datetime
 
43
  name = Column(String, nullable=False)
44
  size = Column(Integer, nullable=False)
45
  storage_path = Column(String, nullable=False)
46
+ file_content = Column(LargeBinary, nullable=True) # Store file in DB
47
  chunks = Column(Integer, default=0)
48
  folder_id = Column(String, ForeignKey("folders.id"), nullable=True)
49
  user_id = Column(String, ForeignKey("users.id"), nullable=False)
app/services/db_storage_adapter.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Database storage adapter - stores files in PostgreSQL (Neon)."""
2
+ from app.ports.storage import StoragePort
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+ # In-memory store keyed by storage_path -> bytes
8
+ # This works because document_service stores the file THEN immediately reads it back
9
+ # For download, we fetch from the documents table directly
10
+ _file_cache: dict[str, bytes] = {}
11
+
12
+
13
+ class DatabaseStorageAdapter(StoragePort):
14
+ """Stores file bytes in an in-memory cache during processing.
15
+
16
+ The actual file content is persisted in the Document.file_content column
17
+ by the document_service after upload. Downloads are served from there.
18
+ This adapter bridges the gap between the StoragePort interface and the DB.
19
+ """
20
+
21
+ async def upload(self, key: str, data: bytes, content_type: str) -> str:
22
+ """Cache file bytes under the given key."""
23
+ _file_cache[key] = data
24
+ logger.info(f"Cached file for key: {key} ({len(data)} bytes)")
25
+ return key
26
+
27
+ async def download(self, key: str) -> bytes:
28
+ """Return cached bytes, or look up from DB if not cached."""
29
+ if key in _file_cache:
30
+ return _file_cache[key]
31
+
32
+ # Fall back to DB lookup
33
+ try:
34
+ from app.database import SessionLocal
35
+ from app.models import Document
36
+ db = SessionLocal()
37
+ try:
38
+ doc = db.query(Document).filter(Document.storage_path == key).first()
39
+ if doc and doc.file_content:
40
+ return doc.file_content
41
+ finally:
42
+ db.close()
43
+ except Exception as e:
44
+ logger.error(f"DB fallback download failed for {key}: {e}")
45
+
46
+ raise FileNotFoundError(f"File not found: {key}")
47
+
48
+ async def delete(self, key: str) -> None:
49
+ """Remove from cache."""
50
+ _file_cache.pop(key, None)
51
+ logger.info(f"Removed cached file: {key}")
52
+
53
+ async def get_presigned_url(self, key: str, expires: int = 3600) -> str:
54
+ """Return a relative download URL handled by the API."""
55
+ return f"/documents/download/{key}"