Spaces:
Sleeping
Sleeping
Blue2962 committed on
Commit ·
1fd68ae
1
Parent(s): b3884c2
- .gitattributes +0 -35
- .gitignore +2 -1
- README.md +0 -12
- __pycache__/pinecone_func.cpython-311.pyc +0 -0
- index.html +0 -17
- upload_knowledge.py +22 -0
- utils/chunking.py +11 -0
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
.env
|
| 2 |
-
firebase_key.json
|
|
|
|
|
|
| 1 |
.env
|
| 2 |
+
firebase_key.json
|
| 3 |
+
docs
|
README.md
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Dr.Yasuda Streamlit
|
| 3 |
-
emoji: 🚀
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: red
|
| 6 |
-
sdk: docker
|
| 7 |
-
app_port: 8501
|
| 8 |
-
tags:
|
| 9 |
-
- streamlit
|
| 10 |
-
pinned: false
|
| 11 |
-
short_description: Streamlit template space
|
| 12 |
-
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/pinecone_func.cpython-311.pyc
DELETED
|
Binary file (2.57 kB)
|
|
|
index.html
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
<!DOCTYPE html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
<meta charset="UTF-8">
|
| 5 |
-
<title>Dr Yasuda App</title>
|
| 6 |
-
<style>
|
| 7 |
-
body { margin: 0; }
|
| 8 |
-
canvas { display: block; }
|
| 9 |
-
</style>
|
| 10 |
-
</head>
|
| 11 |
-
<body>
|
| 12 |
-
<script src="https://cdn.jsdelivr.net/npm/three@0.158.0/build/three.min.js"></script>
|
| 13 |
-
<script>
|
| 14 |
-
|
| 15 |
-
</script>
|
| 16 |
-
</body>
|
| 17 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
upload_knowledge.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from api.pinecone_func import upsert_texts
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from utils.chunking import chunk_text
|
| 4 |
+
import uuid
|
| 5 |
+
|
| 6 |
+
def load_documents(folder="docs"):
    """Collect text chunks, and an ID for each, from the ``*.txt`` files in *folder*.

    Every ``*.txt`` file directly inside *folder* is read as UTF-8 and handed
    to ``chunk_text``; each chunk that comes back is paired with a newly
    generated random UUID4 string.

    Args:
        folder: Directory to scan (non-recursive). Defaults to ``"docs"``.

    Returns:
        A ``(texts, ids)`` tuple of two parallel lists: the chunk strings and
        their corresponding UUID strings.
    """
    texts, ids = [], []
    for path in Path(folder).glob("*.txt"):
        content = path.read_text(encoding="utf-8")
        for piece in chunk_text(content):
            texts.append(piece)
            # IDs are random, so they differ on every run.
            ids.append(str(uuid.uuid4()))
    return texts, ids
|
| 17 |
+
|
| 18 |
+
if __name__ == "__main__":
    # Script entry point: chunk every document in the default folder, then
    # send the chunks and their IDs to upsert_texts in a single call.
    texts, ids = load_documents()

    # Progress message ("uploading N knowledge chunks").
    print(f"{len(texts)}件の知識チャンクをアップロード中")

    upsert_texts(texts, ids)

    # Completion message ("upload complete").
    print("アップロード完了")
|
utils/chunking.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def chunk_text(text: str, max_length: int = 1000) -> list[str]:
    """Split *text* into chunks of at most *max_length* characters.

    Chunks end at the last Japanese full stop ("。") found within the first
    *max_length* characters whenever one exists; otherwise the text is cut
    hard at exactly *max_length* characters. Each chunk is stripped of
    leading/trailing whitespace, and chunks that are empty after stripping
    are dropped.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk. Must be >= 1.

    Returns:
        The list of non-empty chunks, in their original order.

    Raises:
        ValueError: If *max_length* is less than 1 (the loop could not make
            progress with a non-positive window).
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    while len(text) > max_length:
        # rfind only looks at text[0:max_length], so cutting right after the
        # match can never produce a chunk longer than max_length.
        end = text.rfind("。", 0, max_length) + 1
        if end == 0:
            # No sentence terminator in the window: hard cut at exactly
            # max_length characters (not max_length + 1).
            end = max_length
        piece = text[:end].strip()
        if piece:
            chunks.append(piece)
        text = text[end:]

    tail = text.strip()
    if tail:
        chunks.append(tail)
    return chunks
|