Nguyen5 commited on
Commit
ed084d7
·
1 Parent(s): 2667021
Files changed (9) hide show
  1. app.py +184 -0
  2. embeddings.py +23 -0
  3. llm.py +31 -0
  4. load_documents.py +92 -0
  5. rag_pipeline.py +111 -0
  6. retriever.py +14 -0
  7. speech_io.py +100 -0
  8. split_documents.py +28 -0
  9. vectorstore.py +30 -0
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py – SUPABASE RAG CHATBOT (Docker + Ollama)
2
+
3
+ import gradio as gr
4
+
5
+ from load_documents import load_documents, PDF_URL, HG_HTML_URL
6
+ from split_documents import split_documents
7
+ from vectorstore import build_vectorstore
8
+ from retriever import get_retriever
9
+ from llm import load_llm
10
+ from rag_pipeline import answer
11
+ from speech_io import transcribe_audio, synthesize_speech
12
+
13
# ================= INITIALISIERUNG =====================

# Build the entire RAG stack once at import time, in dependency order:
# documents -> chunks -> FAISS index -> retriever -> chat model.
# NOTE(review): this runs on import, so importing app.py triggers
# network access to Supabase and model downloads.

print("🔹 Lade Dokumente aus Supabase …")
_docs = load_documents()  # HG-NRW paragraphs + exam-regulation PDF pages

print("🔹 Splitte Dokumente …")
_chunks = split_documents(_docs)  # recursive character chunks

print("🔹 Baue VectorStore …")
_vs = build_vectorstore(_chunks)  # in-memory FAISS index

print("🔹 Erzeuge Retriever …")
_retriever = get_retriever(_vs)

print("🔹 Lade LLM (Ollama) …")
_llm = load_llm()  # local Ollama chat model
29
+
30
+
31
+ # ================= Quellen Markdown ====================
32
+
33
def format_sources_markdown(sources):
    """Render retrieved-source metadata as a Markdown citation block.

    Each source becomes one list item (linked when a URL is present),
    optionally followed by a quoted snippet line. Returns "" when there
    are no sources.
    """
    if not sources:
        return ""

    parts = ["", "### 📚 Quellen (verwendete Dokumentstellen):"]

    for entry in sources:
        label = f"Quelle {entry['id']} – {entry['source']}"
        if entry["page"]:
            label += f", Seite {entry['page']}"

        link = entry["url"]
        parts.append(f"- [{label}]({link})" if link else f"- {label}")

        if entry["snippet"]:
            parts.append(f" > {entry['snippet']}")

    return "\n".join(parts)
61
+
62
+
63
+ # ================= TEXT CHATBOT ========================
64
+
65
def chatbot_text(user_message, history):
    """Handle one text turn: run the RAG pipeline and extend the chat history.

    Returns the new history plus "" to clear the input textbox.
    """
    if not user_message:
        return history, ""

    reply, sources = answer(
        question=user_message,
        retriever=_retriever,
        chat_model=_llm,
    )
    full_reply = reply + "\n\n" + format_sources_markdown(sources)

    new_history = list(history)
    new_history.append({"role": "user", "content": user_message})
    new_history.append({"role": "assistant", "content": full_reply})

    return new_history, ""
84
+
85
+
86
+ # ================= VOICE CHATBOT =======================
87
+
88
def chatbot_voice(audio_path, history):
    """Handle one voice turn: transcribe, run RAG, synthesize the reply.

    Returns (history, audio_or_None, "") — the trailing "" clears the textbox.
    """
    question = transcribe_audio(audio_path)
    if not question:
        return history, None, ""

    new_history = list(history)
    new_history.append({"role": "user", "content": question})

    reply, sources = answer(
        question=question,
        retriever=_retriever,
        chat_model=_llm,
    )
    full_reply = reply + "\n\n" + format_sources_markdown(sources)

    new_history.append({"role": "assistant", "content": full_reply})

    spoken = synthesize_speech(full_reply)
    return new_history, spoken, ""
108
+
109
+
110
def read_last_answer(history):
    """Speak the most recent assistant message again; None when there is none."""
    for msg in reversed(history or []):
        if msg["role"] == "assistant":
            return synthesize_speech(msg["content"])
    return None
117
+
118
+
119
# ================= UI (Gradio) =========================

# Two-column layout: chat + voice controls on the left, document
# viewers (PDF + paragraph HTML) on the right.

with gr.Blocks(title="Prüfungsrechts-Chatbot (Supabase + Ollama)") as demo:

    gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot (Supabase RAG, Ollama)")
    gr.Markdown("Fragen zum Prüfungsrecht? Text oder Mikrofon möglich.")

    with gr.Row():

        # ---------- CHAT ----------
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                type="messages",  # openai-style {"role", "content"} dicts
                label="Chat",
                height=550,
            )

            msg = gr.Textbox(
                label="Frage eingeben",
                placeholder="Stelle deine Frage zum Prüfungsrecht …",
                autofocus=True,
            )
            # Enter key and button both route through chatbot_text;
            # its second output ("") clears the textbox after each turn.
            msg.submit(chatbot_text, [msg, chatbot], [chatbot, msg])

            send_btn = gr.Button("Senden (Text)")
            send_btn.click(chatbot_text, [msg, chatbot], [chatbot, msg])

            gr.Markdown("### 🎙️ Spracheingabe")
            # Microphone input is passed as a file path; the reply audio
            # comes back as a (sample_rate, int16 array) numpy tuple.
            voice_in = gr.Audio(sources=["microphone"], type="filepath")
            voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")

            send_voice_btn = gr.Button("Sprechen & Senden")
            send_voice_btn.click(
                chatbot_voice,
                [voice_in, chatbot],
                [chatbot, voice_out, msg],
            )

            read_btn = gr.Button("Antwort erneut vorlesen")
            read_btn.click(read_last_answer, [chatbot], [voice_out])

            clear_btn = gr.Button("Chat löschen")
            # Reset the chat history to an empty message list.
            clear_btn.click(lambda: [], None, chatbot)

        # ---------- VIEWER ----------
        with gr.Column(scale=1):
            gr.Markdown("### 📄 Prüfungsordnung (PDF)")
            gr.HTML(
                f"""
                <iframe src="{PDF_URL}"
                style="width:100%; height:330px; border:none;">
                </iframe>
                """
            )

            gr.Markdown("### 📘 Hochschulgesetz NRW (Paragraph-Viewer)")
            gr.HTML(
                f"""
                <iframe src="{HG_HTML_URL}"
                style="width:100%; height:330px; border:none;">
                </iframe>
                """
            )

if __name__ == "__main__":
    # queue() enables concurrent request handling; ssr_mode=False avoids
    # Gradio SSR issues in containerized deployments — TODO confirm.
    demo.queue().launch(ssr_mode=False, show_error=True)
embeddings.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BƯỚC 3: EMBEDDINGS – local & free
3
+ """
4
+
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+
7
+ EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
8
+
9
+
10
def get_embeddings():
    """Return a CPU HuggingFace embedding model producing normalized vectors."""
    print(f">>> Loading embedding model: {EMBEDDING_MODEL}")

    device_opts = {"device": "cpu"}
    encode_opts = {"normalize_embeddings": True}

    model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs=device_opts,
        encode_kwargs=encode_opts,
    )

    print(">>> Embedding model loaded.\n")
    return model


if __name__ == "__main__":
    embedder = get_embeddings()
    print(embedder.embed_query("Test"))
llm.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ llm.py – LLM local über Ollama (z.B. qwen2.5:1.5b-instruct)
3
+ Kein HF Inference, komplett kostenlos.
4
+ """
5
+
6
+ from langchain_community.chat_models import ChatOllama
7
+
8
+
9
+ MODEL_NAME = "qwen2.5:1.5b-instruct"
10
+
11
+
12
def load_llm(model=MODEL_NAME, base_url=None):
    """Create a ChatOllama client for a locally running Ollama server.

    Args:
        model: Ollama model tag (defaults to MODEL_NAME).
        base_url: Ollama server URL. When None, falls back to the
            OLLAMA_BASE_URL environment variable and finally to
            http://127.0.0.1:11434 — the env override matters in Docker,
            where the Ollama server is usually not on localhost.

    Returns:
        A deterministic (temperature 0.0) ChatOllama instance.
    """
    import os

    if base_url is None:
        base_url = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434")

    print(f">>> Lade lokales Ollama-LLM: {model}")

    llm = ChatOllama(
        model=model,
        temperature=0.0,  # deterministic answers for legal Q&A
        base_url=base_url,
    )

    print(">>> LLM ready.\n")
    return llm


if __name__ == "__main__":
    llm = load_llm()
    print(llm.invoke("Sag einen kurzen Satz auf Deutsch."))
load_documents.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # load_documents.py – Supabase + PDF + Paragraph-Viewer
2
+
3
+ import os
4
+ import requests
5
+ import tempfile
6
+ from supabase import create_client
7
+ from langchain_core.documents import Document
8
+ from langchain_community.document_loaders import PyPDFLoader
9
+
10
+ # ===== ENV =====
11
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
12
+ SUPABASE_ANON_KEY = os.getenv("SUPABASE_ANON_KEY")
13
+
14
+ if not SUPABASE_URL or not SUPABASE_ANON_KEY:
15
+ raise RuntimeError("Missing SUPABASE_URL / SUPABASE_ANON_KEY")
16
+
17
+ supabase = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)
18
+
19
+ # ===== PDF (Prüfungsordnung) im Storage =====
20
+ PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
21
+ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/File%20PDF/{PDF_FILE}"
22
+
23
+ # ===== Paragraph-Viewer (hg_clean.html) im Bucket "hg_viewer" =====
24
+ HG_HTML_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"
25
+
26
+
27
+ def load_hg_nrw():
28
+ print(">>> Lade Hochschulgesetz NRW (§) aus Tabelle hg_nrw …")
29
+
30
+ rows = (
31
+ supabase.table("hg_nrw")
32
+ .select("*")
33
+ .order("order_index")
34
+ .execute()
35
+ ).data
36
+
37
+ docs = []
38
+ for r in rows:
39
+ abs_id = r["abs_id"] # z.B. para_1
40
+ title = r["title"] # z.B. § 1 (Fn 44) Geltungsbereich
41
+ content = r["content"] # kompletter Text inkl. Fußnoten
42
+
43
+ # HTML-Viewer: <div id="para_1">…</div>
44
+ viewer_url = f"{HG_HTML_URL}#{abs_id}"
45
+
46
+ docs.append(
47
+ Document(
48
+ page_content=f"{title}\n{content}",
49
+ metadata={
50
+ "source": "Hochschulgesetz NRW",
51
+ "paragraph": title,
52
+ "url": viewer_url,
53
+ },
54
+ )
55
+ )
56
+
57
+ print(f"✔ {len(docs)} Paragraphen geladen.\n")
58
+ return docs
59
+
60
+
61
+ def load_pdf():
62
+ print(">>> Lade Prüfungsordnung PDF …")
63
+
64
+ resp = requests.get(PDF_URL)
65
+ resp.raise_for_status()
66
+
67
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
68
+ tmp.write(resp.content)
69
+ path = tmp.name
70
+
71
+ pages = PyPDFLoader(path).load()
72
+
73
+ for i, p in enumerate(pages):
74
+ p.metadata["source"] = "Prüfungsordnung (PDF)"
75
+ p.metadata["page"] = i
76
+ p.metadata["pdf_url"] = PDF_URL
77
+
78
+ print(f"✔ {len(pages)} PDF-Seiten geladen.\n")
79
+ return pages
80
+
81
+
82
+ def load_documents():
83
+ docs = []
84
+ docs.extend(load_hg_nrw())
85
+ docs.extend(load_pdf())
86
+ print(f"✔ DOCUMENTS LOADED: {len(docs)}\n")
87
+ return docs
88
+
89
+
90
+ if __name__ == "__main__":
91
+ d = load_documents()
92
+ print("Example doc:", d[0])
rag_pipeline.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_pipeline.py – SUPABASE RAG VERSION
2
+
3
+ from typing import List, Dict, Any, Tuple
4
+ from langchain_core.messages import SystemMessage, HumanMessage
5
+
6
+ MAX_CHARS = 900
7
+
8
+
9
+ def build_sources_metadata(docs: List) -> List[Dict[str, Any]]:
10
+ srcs = []
11
+
12
+ for i, d in enumerate(docs):
13
+ meta = d.metadata
14
+ src = meta.get("source")
15
+ page = meta.get("page")
16
+ snippet = d.page_content[:300].replace("\n", " ")
17
+
18
+ if src == "Prüfungsordnung (PDF)":
19
+ pdf_url = meta["pdf_url"]
20
+ if isinstance(page, int) and pdf_url:
21
+ url = f"{pdf_url}#page={page + 1}"
22
+ else:
23
+ url = pdf_url
24
+
25
+ elif src == "Hochschulgesetz NRW":
26
+ url = meta["url"]
27
+ page = None
28
+
29
+ else:
30
+ url = None
31
+
32
+ srcs.append(
33
+ {
34
+ "id": i + 1,
35
+ "source": src,
36
+ "page": page + 1 if isinstance(page, int) else None,
37
+ "url": url,
38
+ "snippet": snippet,
39
+ }
40
+ )
41
+
42
+ return srcs
43
+
44
+
45
+ def format_context(docs):
46
+ if not docs:
47
+ return "(Kein relevanter Kontext gefunden.)"
48
+
49
+ out_lines = []
50
+ for i, d in enumerate(docs):
51
+ txt = d.page_content[:MAX_CHARS]
52
+ src = d.metadata.get("source")
53
+ page = d.metadata.get("page")
54
+
55
+ if src == "Prüfungsordnung (PDF)" and isinstance(page, int):
56
+ src_str = f"{src}, Seite {page + 1}"
57
+ else:
58
+ src_str = src
59
+
60
+ out_lines.append(f"[KONTEXT {i+1}] ({src_str})\n{txt}")
61
+
62
+ return "\n\n".join(out_lines)
63
+
64
+
65
+ SYSTEM_PROMPT = """
66
+ Du bist ein juristisch präziser Chatbot für Prüfungsrecht.
67
+ Du nutzt ausschließlich:
68
+
69
+ - die Prüfungsordnung (PDF) und
70
+ - das Hochschulgesetz NRW (Absätze aus der Datenbank)
71
+
72
+ Regeln:
73
+
74
+ 1. Keine Halluzinationen – nur Inhalte aus dem gelieferten Kontext.
75
+ 2. Wenn der Kontext unklar ist, sage ausdrücklich, dass keine sichere
76
+ Aussage möglich ist.
77
+ 3. Antworte immer in gut verständlichem, ganzen Sätzen.
78
+ 4. Nenne, soweit im Kontext erkennbar:
79
+ - Paragraphen oder Überschriften,
80
+ - das Dokument (Prüfungsordnung / Hochschulgesetz NRW),
81
+ - Seitenzahl (bei der Prüfungsordnung).
82
+ """
83
+
84
+
85
+ def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
86
+ docs = retriever.invoke(question)
87
+ context_str = format_context(docs)
88
+
89
+ human = f"""
90
+ FRAGE:
91
+ {question}
92
+
93
+ NUTZE AUSSCHLIESSLICH DIESEN KONTEXT:
94
+ {context_str}
95
+
96
+ AUFGABE:
97
+ Erstelle eine juristisch korrekte Antwort ausschließlich auf Basis
98
+ des obigen Kontextes. Wenn der Kontext keine sichere Antwort zulässt,
99
+ sage das ausdrücklich und verzichte auf Spekulationen.
100
+ """
101
+
102
+ msgs = [
103
+ SystemMessage(content=SYSTEM_PROMPT),
104
+ HumanMessage(content=human),
105
+ ]
106
+
107
+ result = chat_model.invoke(msgs)
108
+ answer_text = result.content.strip()
109
+
110
+ sources = build_sources_metadata(docs)
111
+ return answer_text, sources
retriever.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BƯỚC 5: RETRIEVER
3
+ """
4
+
5
+ from langchain_community.vectorstores import FAISS
6
+
7
+ RETRIEVER_K = 4
8
+
9
+
10
def get_retriever(vectorstore: FAISS, k: int = RETRIEVER_K):
    """Expose the FAISS vector store as a top-k similarity retriever."""
    print(f">>> Creating retriever with k={k} ...")
    search_opts = {"k": k}
    retriever = vectorstore.as_retriever(search_kwargs=search_opts)
    print(">>> Retriever ready.\n")
    return retriever
speech_io.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ speech_io.py – STT + TTS lokal (transformers)
3
+ """
4
+
5
+ from typing import Optional, Tuple
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from scipy.signal import butter, filtfilt
9
+ from transformers import pipeline
10
+
11
+ ASR_MODEL_ID = "openai/whisper-small"
12
+ TTS_MODEL_ID = "facebook/mms-tts-deu"
13
+
14
+ _asr = None
15
+ _tts = None
16
+
17
+
18
+ def get_asr_pipeline():
19
+ global _asr
20
+ if _asr is None:
21
+ print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
22
+ _asr = pipeline(
23
+ task="automatic-speech-recognition",
24
+ model=ASR_MODEL_ID,
25
+ device="cpu",
26
+ return_timestamps=True,
27
+ chunk_length_s=30,
28
+ )
29
+ return _asr
30
+
31
+
32
+ def get_tts_pipeline():
33
+ global _tts
34
+ if _tts is None:
35
+ print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
36
+ _tts = pipeline(
37
+ task="text-to-speech",
38
+ model=TTS_MODEL_ID,
39
+ )
40
+ return _tts
41
+
42
+
43
+ def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
44
+ nyq = 0.5 * fs
45
+ norm_cutoff = cutoff / nyq
46
+ b, a = butter(order, norm_cutoff, btype="high")
47
+ return filtfilt(b, a, data)
48
+
49
+
50
+ def apply_fade(audio, sr, duration_ms=10):
51
+ fade_samples = int(sr * duration_ms / 1000)
52
+ if fade_samples * 2 >= len(audio):
53
+ return audio
54
+ fade_in_curve = np.linspace(0, 1, fade_samples)
55
+ audio[:fade_samples] *= fade_in_curve
56
+ fade_out_curve = np.linspace(1, 0, fade_samples)
57
+ audio[-fade_samples:] *= fade_out_curve
58
+ return audio
59
+
60
+
61
+ def transcribe_audio(audio_path: str) -> str:
62
+ if audio_path is None:
63
+ return ""
64
+ data, sr = sf.read(audio_path)
65
+ if len(data.shape) > 1:
66
+ data = data[:, 0]
67
+ MAX_SAMPLES = sr * 30
68
+ if len(data) > MAX_SAMPLES:
69
+ data = data[:MAX_SAMPLES]
70
+ asr = get_asr_pipeline()
71
+ print(">>> Transkribiere Audio...")
72
+ result = asr({"array": data, "sampling_rate": sr})
73
+ text = result.get("text", "").strip()
74
+ print("ASR:", text)
75
+ return text
76
+
77
+
78
+ def synthesize_speech(text: str):
79
+ if not text or not text.strip():
80
+ return None
81
+ tts = get_tts_pipeline()
82
+ out = tts(text)
83
+ audio = np.array(out["audio"], dtype=np.float32)
84
+ sr = out.get("sampling_rate", 16000)
85
+ if sr is None or sr <= 0 or sr > 65535:
86
+ sr = 16000
87
+ if audio.ndim > 1:
88
+ audio = audio.squeeze()
89
+ if audio.ndim > 1:
90
+ audio = audio[:, 0]
91
+ try:
92
+ audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
93
+ except Exception:
94
+ pass
95
+ max_val = np.max(np.abs(audio))
96
+ if max_val > 0:
97
+ audio = audio / max_val
98
+ audio = apply_fade(audio, sr)
99
+ audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
100
+ return (sr, audio_int16)
split_documents.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
2
+
3
+ CHUNK_SIZE = 1500
4
+ CHUNK_OVERLAP = 200
5
+
6
+
7
def split_documents(docs):
    """Split documents into overlapping character chunks, tagging each
    chunk's metadata with the chunking configuration."""
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    pieces = chunker.split_documents(docs)

    for piece in pieces:
        # Record the chunking parameters for later inspection/debugging.
        piece.metadata["chunk_size"] = CHUNK_SIZE
        piece.metadata["chunk_overlap"] = CHUNK_OVERLAP

    return pieces


if __name__ == "__main__":
    from load_documents import load_documents

    docs = load_documents()
    chunks = split_documents(docs)
    print("Docs:", len(docs), "Chunks:", len(chunks))
    print(chunks[0].page_content[:300], chunks[0].metadata)
vectorstore.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BƯỚC 4: VECTORSTORE (FAISS in-memory)
3
+ """
4
+
5
+ from langchain_community.vectorstores import FAISS
6
+ from embeddings import get_embeddings
7
+
8
+
9
+ def build_vectorstore(chunks):
10
+ print(">>> Initialising embedding model for FAISS index ...")
11
+ embeddings = get_embeddings()
12
+
13
+ print(f">>> Building FAISS index from {len(chunks)} chunks ...")
14
+ vs = FAISS.from_documents(chunks, embeddings)
15
+ print(">>> FAISS index built.\n")
16
+ return vs
17
+
18
+
19
+ if __name__ == "__main__":
20
+ from load_documents import load_documents
21
+ from split_documents import split_documents
22
+
23
+ docs = load_documents()
24
+ chunks = split_documents(docs)
25
+ vs = build_vectorstore(chunks)
26
+ res = vs.similarity_search(
27
+ "Fristen für die Prüfungsanmeldung im Bachelorstudium", k=3
28
+ )
29
+ for r in res:
30
+ print(r.page_content[:200], r.metadata)