Update indeksator.cmd
Browse files- indeksator.cmd +13 -23
indeksator.cmd
CHANGED
|
@@ -1,45 +1,35 @@
|
|
| 1 |
:; # Indeksator Prosty RAG v0.1 - Jerzy Głowacki na licencji Apache 2.0
|
| 2 |
:; # *NIX:
|
|
|
|
| 3 |
:; embedfile="bge-m3.embedfile"
|
| 4 |
:; inputDir="baza"
|
| 5 |
:; chunksFile="chunks.txt"
|
| 6 |
:; dbFile="prosty-rag.db"
|
| 7 |
:; chunkWords=200
|
| 8 |
:; overlapWords=10
|
| 9 |
-
:; buf=()
|
| 10 |
-
:; for ((i = 0; i < overlapWords; i++)); do buf[i]=""; done
|
| 11 |
:; > $chunksFile
|
| 12 |
:; [ ! -d $inputDir ] && echo Pobieranie przyk艂adowego pliku $inputDir/wikipedia.txt... && curl --create-dirs -Lo $inputDir/wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
|
| 13 |
:; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
|
| 14 |
-
:; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-
|
| 15 |
:; echo "Indeksowanie plik贸w PDF/TXT/MD w folderze $inputDir..."
|
| 16 |
:; shopt -s nullglob
|
| 17 |
:; for pdf in $inputDir/*.pdf; do echo "Konwertowanie $(basename "$pdf")..." && pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
|
| 18 |
:; for file in $inputDir/*.txt $inputDir/*.md; do
|
| 19 |
-
:;
|
| 20 |
-
:;
|
| 21 |
-
:;
|
| 22 |
-
:;
|
| 23 |
-
:;
|
| 24 |
-
:;
|
| 25 |
-
:;
|
| 26 |
-
:;
|
| 27 |
-
:;
|
| 28 |
-
:;
|
| 29 |
-
:; for ((i = 0; i < overlapWords; i++)); do printf "%s " "${buf[i]}"; done
|
| 30 |
-
:; printf "\n"
|
| 31 |
-
:; } >> $chunksFile
|
| 32 |
-
:; wordCount=0
|
| 33 |
-
:; fi
|
| 34 |
-
:; done
|
| 35 |
-
:; done < "$file"
|
| 36 |
-
:; echo >> $chunksFile
|
| 37 |
:; done
|
| 38 |
:; echo "Osadzanie plik贸w..."
|
| 39 |
:; [ -f $dbFile ] && rm $dbFile
|
| 40 |
:; ./$embedfile import $chunksFile $dbFile && echo "Gotowe! Po ka偶dej zmianie w folderze $inputDir nale偶y uruchomi膰 ponownie indeksator."
|
| 41 |
-
:;
|
| 42 |
-
:; exit $?
|
| 43 |
:; # Windows:
|
| 44 |
@echo off
|
| 45 |
setlocal enabledelayedexpansion
|
|
|
|
| 1 |
:; # Indeksator Prosty RAG v0.1 - Jerzy Głowacki na licencji Apache 2.0
|
| 2 |
:; # *NIX:
|
| 3 |
+
:; # Map the kernel name to the platform tag used in the xpdf archive name (Linux -> linux, Darwin -> mac); any other name passes through unchanged.
:; OS=$(uname -s); case "$OS" in Linux) OS=linux ;; Darwin) OS=mac ;; esac
|
| 4 |
:; embedfile="bge-m3.embedfile"
|
| 5 |
:; inputDir="baza"
|
| 6 |
:; chunksFile="chunks.txt"
|
| 7 |
:; dbFile="prosty-rag.db"
|
| 8 |
:; chunkWords=200
|
| 9 |
:; overlapWords=10
|
|
|
|
|
|
|
| 10 |
:; > $chunksFile
|
| 11 |
:; # Download the sample corpus only when the input directory does not exist yet (curl --create-dirs creates it).
:; [ ! -d "$inputDir" ] && echo "Pobieranie przykładowego pliku $inputDir/wikipedia.txt..." && curl --create-dirs -Lo "$inputDir/wikipedia.txt" "https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true" && echo Gotowe!
|
| 12 |
:; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
|
| 13 |
+
:; # Fetch the xpdf tools archive for this OS, extract only bin64/pdftotext into the current directory, then delete the archive (rm, not the Windows-only del).
:; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO "https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz" && tar --strip-components 2 -xzf "xpdf-tools-$OS-4.05.tar.gz" "xpdf-tools-$OS-4.05/bin64/pdftotext" && rm "xpdf-tools-$OS-4.05.tar.gz" && echo Gotowe!
|
| 14 |
:; # Progress message: indexing of PDF/TXT/MD files in $inputDir starts here.
:; echo "Indeksowanie plików PDF/TXT/MD w folderze $inputDir..."
|
| 15 |
:; shopt -s nullglob
|
| 16 |
:; # Prefer the pdftotext binary downloaded into the current directory (which is not on PATH); fall back to a system-wide install.
:; pdftotextCmd=pdftotext; [ -x ./pdftotext ] && pdftotextCmd=./pdftotext
:; # pdftotext is called without an output name, so it chooses the output path itself (per its manual: the PDF's name with a .txt extension).
:; for pdf in "$inputDir"/*.pdf; do echo "Konwertowanie $(basename "$pdf")..." && "$pdftotextCmd" -nopgbrk -enc UTF-8 "$pdf"; done
|
| 17 |
:; for file in $inputDir/*.txt $inputDir/*.md; do
|
| 18 |
+
:; filename=$(basename "$file")
|
| 19 |
+
:; echo "Przetwarzanie $filename..."
|
| 20 |
+
:; # Load one array element per whitespace-separated word. read -d '' -a replaces mapfile, which only exists in bash 4+ (stock macOS ships bash 3.2).
:; # read returns non-zero at end of input because no NUL delimiter is found; the array is still filled, so the status is ignored.
:; words=(); IFS=$'\n' read -r -d '' -a words < <(tr -s '[:space:]' '\n' < "$file" | grep -v '^$')
|
| 21 |
+
:; totalWords=${#words[@]}
|
| 22 |
+
:; start=0
|
| 23 |
+
:; while ((start < totalWords)); do
|
| 24 |
+
:; chunk=("${words[@]:start:chunkWords}")
|
| 25 |
+
:; echo "$filename: ${chunk[*]}" >> $chunksFile
|
| 26 |
+
:; # Stop once this chunk reached the last word, so no trailing chunk made only of already-written overlap words is emitted.
:; ((start + chunkWords >= totalWords)) && break
:; # Advance by the non-overlapping part of the window; if overlapWords >= chunkWords the step would be <= 0 and the loop would never end, so fall back to a full window (assumes chunkWords > 0).
:; step=$((chunkWords - overlapWords)); ((step > 0)) || step=$chunkWords; ((start += step))
|
| 27 |
+
:; done
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
:; done
|
| 29 |
:; # Progress message: the embedding/import step starts here.
:; echo "Osadzanie plików..."
|
| 30 |
:; [ -f $dbFile ] && rm $dbFile
|
| 31 |
:; # Import the chunks file into the database and remember the result right away: the leading ':' on the next line would reset $? to 0.
:; "./$embedfile" import "$chunksFile" "$dbFile" && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."; status=$?
:; # Remove the intermediate chunks file, then exit with the importer's status (not rm's) so bash never reaches the Windows section below.
:; rm -f "$chunksFile"; exit "$status"
|
|
|
|
| 33 |
:; # Windows:
|
| 34 |
@echo off
|
| 35 |
setlocal enabledelayedexpansion
|