File size: 5,436 Bytes
249a397
 
 
 
 
102dac3
249a397
102dac3
 
249a397
102dac3
 
 
 
 
249a397
 
 
 
 
 
 
 
102dac3
249a397
 
102dac3
 
249a397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102dac3
 
 
 
 
 
 
 
 
 
 
 
 
249a397
 
 
 
 
 
 
 
 
 
 
 
 
102dac3
249a397
 
 
 
 
 
102dac3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import os
import re
import json
import hashlib
from pathlib import Path
import sys

# Ensure we can import from backend if running from root
sys.path.append(str(Path(__file__).resolve().parent))

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from backend import config

# === UTILS ===
def hash_text(text: str) -> str:
    """Return a short fingerprint of *text*.

    The fingerprint is the first 8 hexadecimal characters of the MD5 digest
    of the string encoded with ``str.encode()``'s default encoding (UTF-8).
    It is used as a compact content identifier, not for security.
    """
    encoded = text.encode()
    full_hex = hashlib.md5(encoded).hexdigest()
    short_hex = full_hex[:8]
    return short_hex


# === MAIN FUNCTION ===
# A run of '#' at the start of a line that is immediately followed by heading
# text. '#' is excluded from the second group on purpose: with a plain `\S`
# the engine backtracks into well-formed headings ("## Title") and rewrites
# them as "# # Title".
_HEADING_MISSING_SPACE_RE = re.compile(r"^(#+)([^\s#])", re.MULTILINE)


def _normalize_heading_spacing(content: str) -> str:
    """Insert the missing space after leading '#' marks ('##Title' -> '## Title')."""
    return _HEADING_MISSING_SPACE_RE.sub(r"\1 \2", content)


def _write_failed_report(failed_ids: list) -> None:
    """Write one failed identifier per line to ``config.FAILED_CHUNKS_PATH`` (no-op if empty)."""
    if not failed_ids:
        return
    with open(config.FAILED_CHUNKS_PATH, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in failed_ids)
    print("πŸ“ Failed chunk IDs saved to failed_chunks.txt")


def create_faiss_store(
    md_dir: str = str(config.PERSONAL_DATA_DIR),
    chunk_size: int = 1000,
    chunk_overlap: int = 250,
    persist_dir: str = str(config.FAISS_PATH.parent),  # Save to parent of specific version
    chunk_save_path: str = str(config.CHUNKS_PATH),
    min_chunk_chars: int = 50,
) -> None:
    """Build a FAISS index from the markdown files in ``md_dir``.

    Reads every ``*.md`` file directly inside ``md_dir``, splits each file
    into chunks, writes the chunks to ``chunk_save_path`` as JSON, and saves
    a FAISS index under ``persist_dir/v<n_chunks>_<chunk_size>-<chunk_overlap>``.
    Embeddings come from OpenAI when ``config.USE_OPENAI_EMBEDDING`` is truthy,
    otherwise from HuggingFace; the model is ``config.EMBEDDING_MODEL_NAME``
    in both cases.

    Args:
        md_dir: Directory that is globbed (non-recursively) for ``*.md`` files.
        chunk_size: Maximum chunk length passed to the splitter (measured
            with ``len``, i.e. characters).
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.
        persist_dir: Directory under which the versioned index folder is created.
        chunk_save_path: File path of the JSON dump of all chunks.
        min_chunk_chars: Chunks shorter than this (after stripping) are dropped.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # NOTE(review): several status strings below look mis-encoded (mojibake).
    # They are left byte-for-byte as found; confirm the intended emoji.

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n# ", "\n## ", "\n### ", "\n#### ", "\n\n", "\n- ", "\n", ". ", " "],
        keep_separator=True,
        length_function=len,  # consider tokenizer-based later
        is_separator_regex=False,
    )

    docs, all_chunks, failed_chunks = [], [], []

    # Gather markdown files; sorted so that chunk order (and therefore the
    # saved JSON and index) does not depend on filesystem listing order.
    md_files = sorted(Path(md_dir).glob("*.md"))
    if not md_files:
        print(f"⚠️ No markdown files found in: {md_dir}")
    for md_file in md_files:
        try:
            content = md_file.read_text(encoding="utf-8").strip()
        except (OSError, UnicodeError) as e:
            print(f"❌ Failed to read {md_file}: {e}")
            continue

        if not content:
            continue

        # NON-DESTRUCTIVE: only insert a space after hashes when missing.
        # Keeps heading level (##, ###, etc.) and full text, so the
        # "\n## "-style separators above can match.
        content = _normalize_heading_spacing(content)

        docs.append(
            {
                "content": content,
                "metadata": {
                    "source": md_file.name,
                    # First line of the file (the whole text if single-line).
                    "header": content.split("\n", 1)[0],
                },
            }
        )

    # Split into chunks and keep them (no LLM enrichment)
    for doc in docs:
        source = doc["metadata"]["source"]
        header = doc["metadata"]["header"]
        try:
            chunks = splitter.split_text(doc["content"])
        except Exception as e:  # best-effort: one bad file must not abort the build
            print(f"❌ Error splitting {source}: {e}")
            # Record the file so the "Failed" count and report reflect it.
            failed_chunks.append(source)
            continue

        for i, chunk in enumerate(chunks):
            chunk = chunk.strip()
            if len(chunk) < min_chunk_chars:
                continue

            # `i` is the position among ALL chunks of the file, including
            # the short ones that were skipped above.
            chunk_id = f"{source}_#{i}_{hash_text(chunk)}"
            metadata = {
                **doc["metadata"],
                "chunk_id": chunk_id,
                # Computed on the raw chunk, before the [HEADER] prefix is added.
                "has_header": chunk.startswith("#"),
                "word_count": len(chunk.split()),
            }
            chunk = f"[HEADER] {header}\n\n{chunk}"
            # Keep raw chunk (no summaries / questions)
            all_chunks.append({"text": chunk, "metadata": metadata})

    print(f"βœ… Markdown files processed: {len(docs)}")
    print(f"βœ… Chunks created: {len(all_chunks)} | ⚠️ Failed: {len(failed_chunks)}")

    # Ensure output dir exists and save raw chunks JSON. Path.parent is "."
    # for a bare filename, which avoids calling makedirs("") in that case.
    Path(chunk_save_path).parent.mkdir(parents=True, exist_ok=True)
    with open(chunk_save_path, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, indent=2, ensure_ascii=False)
    print(f"πŸ“ Saved chunks β†’ {chunk_save_path}")

    # If nothing to index, stop here (but still report what failed)
    if not all_chunks:
        print("⚠️ No chunks to index. Skipping FAISS build.")
        _write_failed_report(failed_chunks)
        return

    # Prepare FAISS save path
    os.makedirs(persist_dir, exist_ok=True)
    version_tag = f"v{len(all_chunks)}_{chunk_size}-{chunk_overlap}"
    save_path = os.path.join(persist_dir, version_tag)
    os.makedirs(save_path, exist_ok=True)

    # Embeddings + FAISS
    if config.USE_OPENAI_EMBEDDING:
        print(f"πŸ”Ή Using OpenAI Embeddings: {config.EMBEDDING_MODEL_NAME}")
        embeddings = OpenAIEmbeddings(
            model=config.EMBEDDING_MODEL_NAME,
            openai_api_key=config.OPENAI_API_KEY
        )
    else:
        print(f"πŸ”Ή Using HuggingFace Embeddings: {config.EMBEDDING_MODEL_NAME}")
        embeddings = HuggingFaceEmbeddings(
            model_name=config.EMBEDDING_MODEL_NAME,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": True},
        )

    vector_store = FAISS.from_texts(
        texts=[c["text"] for c in all_chunks],
        embedding=embeddings,
        metadatas=[c["metadata"] for c in all_chunks],
    )
    vector_store.save_local(save_path)

    print(f"βœ… FAISS index saved at: {save_path}")
    avg_len = sum(len(c["text"]) for c in all_chunks) / len(all_chunks)
    print(f"πŸ“Š Stats β†’ Chunks: {len(all_chunks)} | Avg length: {avg_len:.1f} characters")

    _write_failed_report(failed_chunks)


# Script entry point: when this file is executed directly, build the store
# with every argument left at its default (the defaults are read from
# `backend.config` in the function signature).
if __name__ == "__main__":
    create_faiss_store()