File size: 3,791 Bytes
c024705
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os, uuid, json
from pathlib import Path
# Replace ollama import with OpenAI client
from openai import OpenAI
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Load settings from a local .env file (if present) so the values below
# can be overridden without editing this script.
load_dotenv()

# Directory scanned (recursively) for source documents.
DATA_DIR = Path("data")
# JSON file persisting every chunk record (id, text, source, embedding).
EMBED_FILE = Path("storage/embeddings.json")
# Embedding model served by Ollama; overridable via EMBED_MODEL.
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text")
# Ollama exposes an OpenAI-compatible API under /v1.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434/v1")
# Ollama does not check the key, but the OpenAI client requires a non-empty one.
OLLAMA_API_KEY = os.getenv("OLLAMA_API_KEY", "ollama")

# Initialize OpenAI client for Ollama
openai_client = OpenAI(
    base_url=OLLAMA_BASE_URL,
    api_key=OLLAMA_API_KEY
)

# --- Load or initialize embeddings ---
# Resume from a previous run when the store exists; otherwise start empty.
if EMBED_FILE.exists():
    with open(EMBED_FILE, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)
else:
    chunks_data = []

# --- Helper functions ---
def load_text_from_file(path: Path) -> str:
    """Extract plain text from a .txt/.md/.pdf file; return "" for anything else.

    Text/markdown files are read as UTF-8 (decoding errors ignored); PDFs
    are joined page-by-page, treating pages with no extractable text as "".
    """
    suffix = path.suffix.lower()
    if suffix in (".txt", ".md"):
        return path.read_text(encoding="utf-8", errors="ignore")
    if suffix == ".pdf":
        reader = PdfReader(str(path))
        pages = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(pages)
    return ""

def chunk_text(text: str):
    """Split *text* into overlapping chunks sized for embedding.

    Uses 900-character chunks with 150-character overlap, preferring
    paragraph and line boundaries before word- and character-level splits.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=150,
        separators=["\n\n", "\n", " ", ""],
    )
    return splitter.split_text(text)

# --- Track existing sources ---
# Filenames already present in the store, used to skip re-embedding.
# NOTE(review): dedup is by bare filename, so two files with the same name
# in different subdirectories under data/ would collide — confirm intended.
existing_files = {c["source"] for c in chunks_data}

new_chunks = []
for fp in DATA_DIR.glob("**/*"):
    # Only supported document types; also filters out directories.
    if fp.suffix.lower() not in [".pdf", ".txt", ".md"]:
        continue
    if fp.name in existing_files:
        continue  # skip already processed files

    raw = load_text_from_file(fp)
    if not raw.strip():
        continue  # nothing extractable (e.g. scanned/image-only PDF)

    # One record per chunk; the embedding field is filled in a later pass.
    for idx, piece in enumerate(chunk_text(raw)):
        new_chunks.append({
            "id": str(uuid.uuid4()),
            "text": piece,
            "source": fp.name,
            "chunk": idx,
            "embedding": None  # to fill below
        })

# --- Generate embeddings with OpenAI client ---
# Embed all new chunks in batches; on a batch failure, retry items one at a
# time so a single bad text does not sink the whole batch. Chunks that still
# fail get a zero vector whose dimension matches the model's real output
# (the previous code hardcoded 384, which is wrong for nomic-embed-text's
# 768-dim vectors). The merged store is then written back to EMBED_FILE.
if new_chunks:
    texts = [c["text"] for c in new_chunks]

    embeddings = []
    batch_size = 32  # batched requests cut round-trips to the server
    total_batches = (len(texts) + batch_size - 1) // batch_size

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            # The OpenAI-compatible endpoint accepts a list input and
            # returns one embedding per item, in input order.
            response = openai_client.embeddings.create(
                model=EMBED_MODEL,
                input=batch
            )
            embeddings.extend(item.embedding for item in response.data)
            print(f"Processed batch {i//batch_size + 1}/{total_batches}")
        except Exception as e:
            print(f"Error embedding batch: {e}")
            # Fallback: process individually
            for text in batch:
                try:
                    response = openai_client.embeddings.create(
                        model=EMBED_MODEL,
                        input=text
                    )
                    embeddings.append(response.data[0].embedding)
                except Exception as e2:
                    print(f"Error embedding individual text: {e2}")
                    # Zero-vector placeholder. Infer the dimension from any
                    # embedding produced so far; fall back to 768, the
                    # output size of the default nomic-embed-text model.
                    dim = len(embeddings[-1]) if embeddings else 768
                    embeddings.append([0.0] * dim)

    # Attach each vector to its chunk record (same order as `texts`).
    for chunk_rec, vec in zip(new_chunks, embeddings):
        chunk_rec["embedding"] = vec

    chunks_data.extend(new_chunks)

    # Persist the merged (old + new) store.
    EMBED_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(EMBED_FILE, "w", encoding="utf-8") as f:
        json.dump(chunks_data, f, ensure_ascii=False, indent=2)

    print(f"Added {len(new_chunks)} new chunks to {EMBED_FILE}")
else:
    print("No new documents found.")