testing / interview /knowledge_service.py
Danielsz's picture
deploy
16e8be2
Raw
History Blame Contribute Delete
2.51 kB
import os
from io import BytesIO
import PyPDF2
import docx
from django.core.files.storage import default_storage
from .engine import engine
from .models import KnowledgeDocument
# ponytail: simple text extractor for documents
def extract_text_from_file(file_path):
    """Return the plain text extracted from the document at *file_path*.

    The extractor is selected by the (case-insensitive) file extension:

    * ``.pdf``  - every page is read with ``PyPDF2.PdfReader`` and the page
      texts are joined, each followed by a newline.
    * ``.doc`` / ``.docx`` - every paragraph is read with ``docx.Document``
      and the paragraph texts are joined, each followed by a newline.
    * ``.txt``  - the file is read as UTF-8 and returned unchanged.

    Any other extension yields an empty string.

    Args:
        file_path: Filesystem path of the document to read.

    Returns:
        The extracted text, or ``""`` when the extension is not handled.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # Defensive ``or ""``: a page with no extractable text must not
            # break the concatenation if the extractor hands back None.
            # Extraction happens while the file handle is still open.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )

    if ext in ('.doc', '.docx'):
        # NOTE(review): python-docx targets the .docx (OOXML) format; a legacy
        # binary .doc file presumably fails to open here - confirm whether
        # .doc should be accepted at all.
        doc = docx.Document(file_path)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    if ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    # Unhandled extension: same fallback as before, an empty string.
    return ""
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split *text* into consecutive windows that share *overlap* characters.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one and is at most ``chunk_size`` characters long. Chunking stops as soon
    as a chunk reaches the end of the text, so no trailing chunk consisting
    only of already-covered overlap is produced.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks in document order; ``[]`` for empty *text*.

    Raises:
        ValueError: If *chunk_size* is not positive or *overlap* is outside
            ``[0, chunk_size)``. Without this check the window would never
            advance and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    start = 0
    while start < total:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= total:
            # This chunk already covers the tail of the text; another pass
            # would only re-emit the overlap region.
            break
        start += step
    return chunks
# Name of the Qdrant collection that stores every document chunk.
_KNOWLEDGE_COLLECTION = "knowledge_base"
# Vector size used when the collection has to be created. The original author
# notes this as the MiniLM size; it must match what engine.model.encode()
# produces - TODO confirm against the configured model.
_EMBEDDING_DIM = 384


def _ensure_knowledge_collection():
    """Create the knowledge-base collection when looking it up fails."""
    try:
        engine.qdrant.get_collection(_KNOWLEDGE_COLLECTION)
    except Exception:
        # Any lookup failure is treated as "collection missing". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt / SystemExit propagate.
        from qdrant_client.models import Distance, VectorParams
        engine.qdrant.create_collection(
            collection_name=_KNOWLEDGE_COLLECTION,
            vectors_config=VectorParams(
                size=_EMBEDDING_DIM, distance=Distance.COSINE
            ),
        )


def ingest_document(doc_id):
    """Extract, chunk, embed and index one ``KnowledgeDocument``.

    The document's ``status`` is saved as ``'processing'`` first. On success
    ``chunks_count`` is set to the number of chunks and ``status`` becomes
    ``'indexed'``; if any step fails the error is logged with its traceback
    and ``status`` becomes ``'error'``.

    Args:
        doc_id: Primary key of the ``KnowledgeDocument`` to ingest. It is also
            stored in each point's payload together with the chunk text and
            the document title.

    Note:
        The initial ``KnowledgeDocument.objects.get`` call is not guarded, so
        a failed lookup propagates to the caller.
    """
    import logging
    import uuid

    doc = KnowledgeDocument.objects.get(id=doc_id)
    doc.status = 'processing'
    doc.save()

    try:
        # Resolved inside the try so a failure here marks the document as
        # 'error' instead of leaving it stuck in 'processing'.
        file_path = doc.file.path

        text = extract_text_from_file(file_path)
        chunks = chunk_text(text)

        _ensure_knowledge_collection()

        vectors = engine.model.encode(chunks).tolist()

        from qdrant_client.models import PointStruct
        # One point per chunk; ids are fresh random UUIDs on every run.
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={"text": chunk, "doc_id": doc_id, "title": doc.title},
            )
            for vector, chunk in zip(vectors, chunks)
        ]

        engine.qdrant.upsert(
            collection_name=_KNOWLEDGE_COLLECTION,
            points=points,
        )

        doc.chunks_count = len(chunks)
        doc.status = 'indexed'
        doc.save()
    except Exception:
        # Top-level boundary for the ingestion job: keep the traceback in the
        # log and record the failure on the document.
        logging.getLogger(__name__).exception(
            "Error ingesting doc %s", doc_id
        )
        doc.status = 'error'
        doc.save()