commit
Browse files- app.py +1 -0
- load_documents.py +31 -5
app.py
CHANGED
|
@@ -143,6 +143,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (Supabase + OpenAI)") as demo:
|
|
| 143 |
with gr.Column(scale=2):
|
| 144 |
|
| 145 |
chatbot = gr.Chatbot(
|
|
|
|
| 146 |
label="Chat",
|
| 147 |
height=550,
|
| 148 |
)
|
|
|
|
| 143 |
with gr.Column(scale=2):
|
| 144 |
|
| 145 |
chatbot = gr.Chatbot(
|
| 146 |
+
type="messages",
|
| 147 |
label="Chat",
|
| 148 |
height=550,
|
| 149 |
)
|
load_documents.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
|
| 2 |
import os
|
| 3 |
import requests
|
|
@@ -29,11 +30,7 @@ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/File%20PDF/{PDF_FILE}"
|
|
| 29 |
# -> in der App: iframe src="file=hg_clean.html"
|
| 30 |
# -> für Links: "file=hg_clean.html#para_123"
|
| 31 |
# ---------------------------------------------------------
|
| 32 |
-
|
| 33 |
-
#HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/resolve/main/hg_clean.html"
|
| 34 |
-
#HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/raw/main/hg_clean.html"
|
| 35 |
-
|
| 36 |
-
HG_HTML_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"
|
| 37 |
|
| 38 |
def load_hg_nrw():
|
| 39 |
"""
|
|
@@ -117,3 +114,32 @@ if __name__ == "__main__":
|
|
| 117 |
docs = load_documents()
|
| 118 |
print(docs[0])
|
| 119 |
print("Total:", len(docs))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# load_documents.py – Supabase + statischer HTML-Viewer
|
| 2 |
|
| 3 |
import os
|
| 4 |
import requests
|
|
|
|
| 30 |
# -> in der App: iframe src="file=hg_clean.html"
|
| 31 |
# -> für Links: "file=hg_clean.html#para_123"
|
| 32 |
# ---------------------------------------------------------
|
| 33 |
+
HG_HTML_URL = "file=hg_clean.html" # WICHTIG: nicht absolut, Space kümmert sich
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def load_hg_nrw():
|
| 36 |
"""
|
|
|
|
| 114 |
docs = load_documents()
|
| 115 |
print(docs[0])
|
| 116 |
print("Total:", len(docs))
|
| 117 |
+
|
| 118 |
+
# split_documents.py – v2
#
# Splits loaded documents into overlapping chunks for embedding/retrieval.

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters: ~1500 characters per chunk with 200 characters of
# overlap so that context straddling a boundary is not lost.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200


def split_documents(docs):
    """Split *docs* into overlapping chunks.

    Each resulting chunk's metadata is stamped with the chunk size and
    overlap that produced it, so downstream code can tell which chunking
    configuration was used.

    Parameters
    ----------
    docs : list
        Documents as returned by ``load_documents()``.

    Returns
    -------
    list
        The chunked documents produced by the splitter.
    """
    # Prefer paragraph, then line, then sentence, then word boundaries;
    # the empty-string separator is the hard fallback (split anywhere).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    pieces = text_splitter.split_documents(docs)

    # Record the chunking configuration on every chunk.
    for piece in pieces:
        piece.metadata["chunk_size"] = CHUNK_SIZE
        piece.metadata["chunk_overlap"] = CHUNK_OVERLAP

    return pieces


if __name__ == "__main__":
    # Smoke test: load the corpus, split it, and show a sample chunk.
    from load_documents import load_documents

    docs = load_documents()
    chunks = split_documents(docs)
    print("Docs:", len(docs), "Chunks:", len(chunks))
    print(chunks[0].page_content[:300], chunks[0].metadata)