| import os |
| from io import BytesIO |
| import PyPDF2 |
| import docx |
| from django.core.files.storage import default_storage |
| from .engine import engine |
| from .models import KnowledgeDocument |
|
|
| |
def extract_text_from_file(file_path):
    """Extract plain text from a PDF, Word, or UTF-8 text file.

    The parser is selected from the file extension, compared
    case-insensitively.

    Args:
        file_path: Filesystem path of the file to read.

    Returns:
        The extracted text. For PDFs every page's text is followed by a
        newline; for Word documents every paragraph is followed by a
        newline; ``.txt`` content is returned exactly as read. Any other
        extension yields an empty string.

    Raises:
        OSError: If a ``.pdf`` or ``.txt`` file cannot be opened.
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # Build the result while the file is still open. ``or ""`` keeps
            # a page that yields no text (None/empty) from raising TypeError.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )

    if ext in ('.doc', '.docx'):
        # NOTE(review): python-docx is documented for .docx packages; a legacy
        # binary .doc will presumably fail to open here — confirm whether
        # '.doc' should be routed through a different converter.
        doc = docx.Document(file_path)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    if ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    # Unsupported extension: callers receive an empty string.
    return ""
|
|
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split *text* into fixed-size chunks that overlap their neighbours.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one. Chunking stops as soon as a chunk reaches the end of the text, so
    no trailing chunk is emitted that is wholly contained in its predecessor.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of substrings of *text* in order. Empty input yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    start = 0
    while start < total:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= total:
            # This chunk already covers the tail; another window would only
            # repeat text that is fully contained in it.
            break
        start += step
    return chunks
|
|
def ingest_document(doc_id):
    """Extract, chunk, embed and index one ``KnowledgeDocument`` in Qdrant.

    The document's ``status`` is set to ``'processing'`` up front, then to
    ``'indexed'`` (with ``chunks_count`` filled in) on success or ``'error'``
    if any step fails. Failures are logged with a traceback and are not
    re-raised.

    Args:
        doc_id: Primary key of the ``KnowledgeDocument`` to ingest. It is also
            stored in every point's payload under ``"doc_id"``.

    Raises:
        KnowledgeDocument.DoesNotExist: If no document has this id (the
            lookup happens before the error-handling section).
    """
    import logging
    import uuid

    logger = logging.getLogger(__name__)
    collection = "knowledge_base"

    doc = KnowledgeDocument.objects.get(id=doc_id)
    doc.status = 'processing'
    doc.save()

    try:
        from qdrant_client.models import Distance, PointStruct, VectorParams

        # Resolve the path inside the try so a storage/path failure marks the
        # document as 'error' instead of leaving it stuck in 'processing'.
        text = extract_text_from_file(doc.file.path)
        chunks = chunk_text(text)
        if not chunks:
            # Nothing to embed (empty file or unsupported extension); fail
            # explicitly rather than sending an empty upsert.
            raise ValueError("no text could be extracted from the document")

        try:
            engine.qdrant.get_collection(collection)
        except Exception:
            # Any lookup failure is treated as "collection missing". If the
            # real cause is something else, create_collection fails too and
            # the outer handler records the error.
            # NOTE(review): size=384 presumably matches the embedding
            # dimension of engine.model — confirm.
            engine.qdrant.create_collection(
                collection_name=collection,
                vectors_config=VectorParams(size=384, distance=Distance.COSINE),
            )

        vectors = engine.model.encode(chunks).tolist()

        # NOTE(review): point ids are random UUIDs, so ingesting the same
        # document twice adds a second set of points rather than replacing
        # the first — confirm whether old points should be removed first.
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={"text": chunk, "doc_id": doc_id, "title": doc.title},
            )
            for vector, chunk in zip(vectors, chunks)
        ]

        engine.qdrant.upsert(collection_name=collection, points=points)

        doc.chunks_count = len(chunks)
        doc.status = 'indexed'
        doc.save()

    except Exception:
        # Top-level boundary for the ingestion job: record the traceback and
        # surface the failure through the document status.
        logger.exception("Error ingesting doc %s", doc_id)
        doc.status = 'error'
        doc.save()
|
|