commit
Browse files- app.py +1 -0
- load_documents.py +31 -5
app.py
CHANGED
|
@@ -143,6 +143,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (Supabase + OpenAI)") as demo:
|
|
| 143 |
with gr.Column(scale=2):
|
| 144 |
|
| 145 |
chatbot = gr.Chatbot(
|
|
|
|
| 146 |
label="Chat",
|
| 147 |
height=550,
|
| 148 |
)
|
|
|
|
| 143 |
with gr.Column(scale=2):
|
| 144 |
|
| 145 |
chatbot = gr.Chatbot(
|
| 146 |
+
type="messages",
|
| 147 |
label="Chat",
|
| 148 |
height=550,
|
| 149 |
)
|
load_documents.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
|
| 2 |
import os
|
| 3 |
import requests
|
|
@@ -29,11 +30,7 @@ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/File%20PDF/{PDF_FILE}"
|
|
| 29 |
# -> in der App: iframe src="file=hg_clean.html"
|
| 30 |
# -> für Links: "file=hg_clean.html#para_123"
|
| 31 |
# ---------------------------------------------------------
|
| 32 |
-
|
| 33 |
-
#HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/resolve/main/hg_clean.html"
|
| 34 |
-
#HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/raw/main/hg_clean.html"
|
| 35 |
-
|
| 36 |
-
HG_HTML_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"
|
| 37 |
|
| 38 |
def load_hg_nrw():
|
| 39 |
"""
|
|
@@ -117,3 +114,32 @@ if __name__ == "__main__":
|
|
| 117 |
docs = load_documents()
|
| 118 |
print(docs[0])
|
| 119 |
print("Total:", len(docs))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# load_documents.py – Supabase + statischer HTML-Viewer
|
| 2 |
|
| 3 |
import os
|
| 4 |
import requests
|
|
|
|
| 30 |
# -> in der App: iframe src="file=hg_clean.html"
|
| 31 |
# -> für Links: "file=hg_clean.html#para_123"
|
| 32 |
# ---------------------------------------------------------
|
| 33 |
+
HG_HTML_URL = "file=hg_clean.html" # WICHTIG: nicht absolut, Space kümmert sich
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def load_hg_nrw():
|
| 36 |
"""
|
|
|
|
| 114 |
docs = load_documents()
|
| 115 |
print(docs[0])
|
| 116 |
print("Total:", len(docs))
|
| 117 |
+
|
| 118 |
+
# split_documents.py – v2
#
# Splits loaded documents into overlapping chunks for embedding/retrieval.

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters: ~1500 characters per chunk with 200 characters of
# overlap so that context straddling a boundary is not lost.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200


def split_documents(docs):
    """Split *docs* into overlapping chunks.

    Each resulting chunk's metadata is stamped with the chunk size and
    overlap that produced it, so downstream code can tell which chunking
    configuration was used.

    Parameters
    ----------
    docs : list
        Documents as returned by ``load_documents()``.

    Returns
    -------
    list
        The chunked documents produced by the splitter.
    """
    # Prefer paragraph, then line, then sentence, then word boundaries;
    # the empty-string separator is the hard fallback (split anywhere).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    pieces = text_splitter.split_documents(docs)

    # Record the chunking configuration on every chunk.
    for piece in pieces:
        piece.metadata["chunk_size"] = CHUNK_SIZE
        piece.metadata["chunk_overlap"] = CHUNK_OVERLAP

    return pieces


if __name__ == "__main__":
    # Smoke test: load the corpus, split it, and show a sample chunk.
    from load_documents import load_documents

    docs = load_documents()
    chunks = split_documents(docs)
    print("Docs:", len(docs), "Chunks:", len(chunks))
    print(chunks[0].page_content[:300], chunks[0].metadata)