import os
import json
import time

import faiss
import fitz  # PyMuPDF
import nltk
import numpy as np
from openai import OpenAI

nltk.download("punkt", quiet=True)
PDF_FOLDER = "backend/app/sentiment/pds"
FAISS_INDEX_PATH = "backend/app/sentiment/faiss_index.idx"
METADATA_PATH = "backend/app/sentiment/metadata.json"
EMBEDDING_MODEL = "text-embedding-3-large"
CHUNK_SIZE_TOKENS = 200
OVERLAP_TOKENS = 50

# Never hard-code API keys in source; read the key from the environment
# instead (e.g. export OPENAI_API_KEY="sk-..." in the shell).
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def extract_text_by_page(pdf_path):
    """Extract text from each page of a PDF separately (page numbers become metadata)."""
    doc = fitz.open(pdf_path)
    pages_text = [page.get_text() for page in doc]
    doc.close()
    return pages_text
def tokenize_text(text):
    """Tokenize text into whitespace tokens using the nltk sentence tokenizer + split."""
    sentences = nltk.sent_tokenize(text)
    tokens = []
    for sentence in sentences:
        tokens.extend(sentence.split())
    return tokens
def detokenize_tokens(tokens):
    """Convert tokens back to text."""
    return " ".join(tokens)
def chunk_tokens(tokens, chunk_size=CHUNK_SIZE_TOKENS, overlap=OVERLAP_TOKENS):
    """Chunk tokens into overlapping chunks of at most `chunk_size` tokens."""
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunks.append(tokens[start:end])
        if end >= len(tokens):
            break
        # Step back by `overlap` tokens so consecutive chunks share context.
        start = end - overlap
    return chunks
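# Example (illustrative): with chunk_size=5 and overlap=2, a 9-token input
# produces chunks covering tokens [0:5], [3:8], and [6:9], so
# [len(c) for c in chunk_tokens(list(range(9)), chunk_size=5, overlap=2)]
# evaluates to [5, 5, 3].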
def get_embedding(text):
    """Embed a single text with the configured OpenAI embedding model."""
    response = client.embeddings.create(
        input=text,
        model=EMBEDDING_MODEL,
    )
    return response.data[0].embedding
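# Bulk runs over many chunks can hit OpenAI rate limits. A minimal retry
# helper (an optional sketch; the pipeline below calls get_embedding directly):
def get_embedding_with_retry(text, retries=3):
    """Retry get_embedding with exponential backoff on transient API errors."""
    for attempt in range(retries):
        try:
            return get_embedding(text)
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s, ...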
def build_index_and_save():
    all_embeddings = []
    metadata = []
    print("Reading PDFs and chunking text...")
    for filename in os.listdir(PDF_FOLDER):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(PDF_FOLDER, filename)
        print(f"Processing {filename}")
        pages = extract_text_by_page(pdf_path)
        for page_num, page_text in enumerate(pages):
            page_text = page_text.lower().strip()
            tokens = tokenize_text(page_text)
            for i, chunk in enumerate(chunk_tokens(tokens)):
                chunk_text = detokenize_tokens(chunk)
                embedding = get_embedding(chunk_text)
                all_embeddings.append(embedding)
                metadata.append({
                    "source_pdf": filename,
                    "page": page_num,
                    "chunk_index": i,
                    "text": chunk_text[:500],  # store a snippet, not the full chunk
                })
                # Checkpoint every 50 chunks so a crash doesn't lose all progress.
                if len(all_embeddings) % 50 == 0:
                    save_index_and_metadata(all_embeddings, metadata)
    if all_embeddings:
        save_index_and_metadata(all_embeddings, metadata)
    print("Index build completed.")
def save_index_and_metadata(embeddings, metadata):
    """Write the FAISS index and its aligned metadata to disk (full rebuild each call)."""
    print(f"Saving index with {len(embeddings)} vectors...")
    dimension = len(embeddings[0])
    embeddings_np = np.array(embeddings).astype("float32")
    # L2-normalize so that inner product on IndexFlatIP equals cosine similarity.
    faiss.normalize_L2(embeddings_np)
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings_np)
    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(METADATA_PATH, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)
def load_index_and_metadata():
    index = faiss.read_index(FAISS_INDEX_PATH)
    with open(METADATA_PATH, "r", encoding="utf-8") as f:
        metadata = json.load(f)
    return index, metadata
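# Note: FAISS row i corresponds to metadata[i], so the index and metadata files
# must always be written together (as save_index_and_metadata does) or result
# lookups will mis-align.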
def query_index(query, top_k=5):
    index, metadata = load_index_and_metadata()
    # Lowercase the query to match the preprocessing applied at index time.
    query_embedding = get_embedding(query.lower())
    query_embedding_np = np.array([query_embedding]).astype("float32")
    faiss.normalize_L2(query_embedding_np)
    distances, indices = index.search(query_embedding_np, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx == -1:  # FAISS pads with -1 when the index holds fewer than top_k vectors
            continue
        meta = metadata[idx]
        results.append({
            "score": float(dist),
            "source_pdf": meta["source_pdf"],
            "page": meta["page"],
            "chunk_index": meta["chunk_index"],
            "text_snippet": meta["text"],
        })
    return results
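# Example result shape (values are illustrative):
# {"score": 0.83, "source_pdf": "example.pdf", "page": 3, "chunk_index": 1,
#  "text_snippet": "..."}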
if __name__ == "__main__":
    build_index_and_save()
    print("\nQuery results:")
    # Sample query text is illustrative; replace with a real question about the PDFs.
    for result in query_index("what fees apply to this product?"):
        print(json.dumps(result, indent=2))