# Page header captured together with this file (not part of the program):
#   Spaces status: Runtime error
#   File size: 1,973 Bytes
#   382c248 (presumably the revision id shown by the file viewer)
import os
import re
import pickle
import faiss
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
# Directory that contains this script; ROOT is its parent directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT = os.path.dirname(_SCRIPT_DIR)

# Folder under ROOT that holds the posts.
POST_DIR = os.path.join(ROOT, "posts")

# Output locations under ROOT for the FAISS index and its pickled metadata.
INDEX_PATH = os.path.join(ROOT, "faiss_index.bin")
META_PATH = os.path.join(ROOT, "faiss_meta.pkl")
def split_into_chunks(text, chunk_size=4):
    """Split *text* into chunks of up to *chunk_size* sentences each.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. The sentences of one chunk are joined with a
    single space, the chunk is stripped of surrounding whitespace, and
    chunks that end up empty are dropped.

    Args:
        text: The text to split.
        chunk_size: Maximum number of sentences per chunk; must be >= 1.

    Returns:
        A list of non-empty chunk strings, in the order they occur in *text*.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    # Fail loudly: 0 would otherwise surface as an opaque range() error and a
    # negative value would silently produce no chunks at all.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    sentences = re.split(r'(?<=[.!?])\s+', text)
    grouped = (
        " ".join(sentences[start:start + chunk_size]).strip()
        for start in range(0, len(sentences), chunk_size)
    )
    return [chunk for chunk in grouped if chunk]
def load_html_posts(folder):
    """Read every ``.html`` file in *folder* and split its text into chunks.

    Each file is parsed with BeautifulSoup's ``html.parser``, reduced to its
    text (pieces separated by newlines) and cut into chunks of four sentences
    by ``split_into_chunks``.

    Args:
        folder: Path of the directory to scan. Only its direct entries whose
            names end in ``.html`` are read; everything else is skipped.

    Returns:
        A tuple ``(texts, ids, meta)`` of three parallel lists:
        ``texts`` holds the chunk strings, ``ids`` holds
        ``"<filename>_<chunk index>"`` identifiers, and ``meta`` holds
        ``{"source": filename, "chunk": chunk index}`` dictionaries.

    Raises:
        OSError: If *folder* cannot be listed or a file cannot be read.
    """
    texts = []
    ids = []
    meta = []

    # os.listdir() yields names in arbitrary order; sorting keeps the chunk
    # order (and anything built on it) reproducible across runs and machines.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".html"):
            continue

        path = os.path.join(folder, filename)
        with open(path, "r", encoding="utf-8") as f:
            raw_html = f.read()

        soup = BeautifulSoup(raw_html, "html.parser")
        cleaned = soup.get_text(separator="\n")

        for i, chunk in enumerate(split_into_chunks(cleaned, chunk_size=4)):
            texts.append(chunk)
            ids.append(f"{filename}_{i}")
            meta.append({"source": filename, "chunk": i})

    return texts, ids, meta
def main():
    """Embed all post chunks and write the FAISS index plus its metadata.

    Loads the chunks found under ``POST_DIR``, encodes them with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model, adds the vectors to a
    ``faiss.IndexFlatL2`` index, and saves the index to ``INDEX_PATH`` and the
    ``(texts, ids, meta)`` tuple as a pickle to ``META_PATH``. Returns early,
    writing nothing, when no chunks were loaded.
    """
    print("Loading posts...")
    corpus, chunk_ids, chunk_meta = load_html_posts(POST_DIR)

    # Nothing to index: stop before loading the embedding model.
    if not corpus:
        print("No data found.")
        return

    print(f"Loaded {len(corpus)} chunks. Embedding now...")
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    vectors = encoder.encode(corpus)

    # The index width is the embedding dimension (second axis of the result).
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors.astype("float32"))

    print("Saving FAISS index and metadata...")
    faiss.write_index(index, INDEX_PATH)
    payload = (corpus, chunk_ids, chunk_meta)
    with open(META_PATH, "wb") as out:
        pickle.dump(payload, out)

    print("Done.")


if __name__ == "__main__":
    main()
|