Nguyen5 commited on
Commit
e640fc1
·
1 Parent(s): 19ea0fe
Files changed (2) hide show
  1. app.py +1 -0
  2. load_documents.py +31 -5
app.py CHANGED
@@ -143,6 +143,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (Supabase + OpenAI)") as demo:
143
  with gr.Column(scale=2):
144
 
145
  chatbot = gr.Chatbot(
 
146
  label="Chat",
147
  height=550,
148
  )
 
143
  with gr.Column(scale=2):
144
 
145
  chatbot = gr.Chatbot(
146
+ type="messages",
147
  label="Chat",
148
  height=550,
149
  )
load_documents.py CHANGED
@@ -1,3 +1,4 @@
 
1
 
2
  import os
3
  import requests
@@ -29,11 +30,7 @@ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/File%20PDF/{PDF_FILE}"
29
  # -> in der App: iframe src="file=hg_clean.html"
30
  # -> für Links: "file=hg_clean.html#para_123"
31
  # ---------------------------------------------------------
32
- # HG_HTML_URL = "file=hg_clean.html" # WICHTIG: nicht absolut, Space kümmert sich
33
- #HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/resolve/main/hg_clean.html"
34
- #HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/raw/main/hg_clean.html"
35
-
36
- HG_HTML_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"
37
 
38
  def load_hg_nrw():
39
  """
@@ -117,3 +114,32 @@ if __name__ == "__main__":
117
  docs = load_documents()
118
  print(docs[0])
119
  print("Total:", len(docs))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # load_documents.py – Supabase + statischer HTML-Viewer
2
 
3
  import os
4
  import requests
 
30
  # -> in der App: iframe src="file=hg_clean.html"
31
  # -> für Links: "file=hg_clean.html#para_123"
32
  # ---------------------------------------------------------
33
+ HG_HTML_URL = "file=hg_clean.html" # WICHTIG: nicht absolut, Space kümmert sich
 
 
 
 
34
 
35
  def load_hg_nrw():
36
  """
 
114
  docs = load_documents()
115
  print(docs[0])
116
  print("Total:", len(docs))
117
+
118
+ # --- appended from split_documents.py ---
119
# split_documents.py – v2

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration (both values in characters):
# CHUNK_SIZE is the target length of each chunk; CHUNK_OVERLAP is how much
# consecutive chunks share, so context is not lost at chunk boundaries.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200
125
+
126
def split_documents(docs):
    """Split loaded documents into overlapping text chunks.

    Uses a recursive character splitter that prefers to break on
    paragraph, line, and sentence boundaries before falling back to
    word- and character-level splits.

    Args:
        docs: A list of LangChain ``Document`` objects.

    Returns:
        The list of chunked ``Document`` objects, each annotated with
        the splitter configuration in its metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    pieces = text_splitter.split_documents(docs)

    # Record the splitter configuration on every chunk for traceability
    # (e.g. when comparing retrieval quality across chunking settings).
    for piece in pieces:
        piece.metadata.update(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

    return pieces
139
+
140
if __name__ == "__main__":
    # Smoke test: load the source documents, split them, and print a
    # summary plus a preview of the first chunk.
    from load_documents import load_documents

    loaded = load_documents()
    pieces = split_documents(loaded)
    print("Docs:", len(loaded), "Chunks:", len(pieces))
    print(pieces[0].page_content[:300], pieces[0].metadata)