Update indeksator.cmd
Browse files- indeksator.cmd +83 -43
indeksator.cmd
CHANGED
|
@@ -1,43 +1,83 @@
|
|
| 1 |
-
:; # Indeksator Prosty RAG v0.1 - Jerzy Głowacki na licencji Apache 2.0
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
for
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
echo Indeksowanie plików PDF/TXT/MD w folderze
|
| 16 |
-
for
|
| 17 |
-
for
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
echo
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
:; # Indeksator Prosty RAG v0.1 - Jerzy Głowacki na licencji Apache 2.0
|
| 2 |
+
:; # *NIX section: bash executes every ":;"-prefixed line (":" is a no-op), cmd.exe reads such lines as labels.
:; # Files used by the indexer.
:; inputDir="baza"; embedfile="bge-m3.embedfile"
:; chunksFile="chunks.txt"; dbFile="prosty-rag.db"
:; # Chunk length and overlap, both counted in whitespace-separated words.
:; chunkWords=200; overlapWords=10
:; # Pre-fill the buffer of most recent words with empty slots.
:; buf=()
:; i=0; while (( i < overlapWords )); do buf[i]=""; i=$((i + 1)); done
:; # Start every run from an empty chunks file.
:; : > "$chunksFile"
|
| 12 |
+
:; # Download whatever is missing before indexing: sample data, the embedding runtime and pdftotext.
:; # The input location is a directory, so it is tested with -d; -f is never true for a directory
:; # and made the sample file download again on every run.
:; if [ ! -d "$inputDir" ]; then
:;   echo "Pobieranie przykładowego pliku $inputDir/wikipedia.txt..."
:;   curl --create-dirs -Lo "$inputDir/wikipedia.txt" "https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true" && echo "Gotowe!"
:; fi
:; if [ ! -f "$embedfile" ]; then
:;   echo "Pobieranie $embedfile..."
:;   # NOTE(review): this URL ends in bge-m3.llamafile while the Windows section fetches bge-m3.embedfile - confirm which is intended.
:;   curl -Lo "$embedfile" "https://huggingface.co/jglowa/prosty-rag/resolve/main/bge-m3.llamafile?download=true" && chmod +x "$embedfile" && echo "Gotowe!"
:; fi
:; if [ ! -f pdftotext ]; then
:;   echo "Pobieranie pdftotext..."
:;   # The archive is removed with rm; del is a cmd.exe builtin and does not exist on *NIX.
:;   curl -LO https://dl.xpdfreader.com/xpdf-tools-linux-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-linux-4.05.tar.gz xpdf-tools-linux-4.05/bin64/pdftotext && rm xpdf-tools-linux-4.05.tar.gz && echo "Gotowe!"
:; fi
|
| 15 |
+
:; echo "Indeksowanie plików PDF/TXT/MD w folderze $inputDir..."
:; # Convert every PDF in the input directory to text with pdftotext.
:; # Prefer the pdftotext binary in the current directory (where the script downloads it); otherwise rely on PATH.
:; pdftotextCmd="pdftotext"; [ -x ./pdftotext ] && pdftotextCmd="./pdftotext"
:; for pdf in "$inputDir"/*.pdf; do
:;   # With no PDFs present the glob stays literal; skip it instead of passing it to pdftotext.
:;   [ -f "$pdf" ] || continue
:;   echo "Konwertowanie $(basename "$pdf")..." && "$pdftotextCmd" -nopgbrk -enc UTF-8 "$pdf"
:; done
|
| 17 |
+
:; # chunk_file FILE
:; # Writes FILE to stdout as chunk lines, each starting with "<basename of FILE>: ".
:; # Every word of FILE is emitted. After each $chunkWords words a new line is started that
:; # opens with the most recent words held in the global array buf (overlapWords slots), so
:; # neighbouring chunks overlap. This mirrors the output of the Windows loop in this file;
:; # previously only the overlap buffer was written and the rest of the text was lost.
:; # Globals: chunkWords (read), buf (read and updated).
:; chunk_file() {
:;   local name word item wordCount=0
:;   local -a words
:;   name=$(basename "$1")
:;   printf '%s: ' "$name"
:;   # read -a splits on whitespace without glob-expanding words such as "*";
:;   # the || clause keeps a final line that has no trailing newline.
:;   while read -r -a words || (( ${#words[@]} > 0 )); do
:;     for word in "${words[@]}"; do
:;       printf '%s ' "$word"
:;       # Drop the oldest word and append the current one.
:;       buf=("${buf[@]:1}" "$word")
:;       wordCount=$((wordCount + 1))
:;       if (( wordCount >= chunkWords )); then
:;         # Chunk is full: start the next line and seed it with the overlap words.
:;         printf '\n%s: ' "$name"
:;         for item in "${buf[@]}"; do printf '%s ' "$item"; done
:;         wordCount=0
:;       fi
:;     done
:;   done < "$1"
:;   printf '\n'
:; }
:; for file in "$inputDir"/*.txt "$inputDir"/*.md; do
:;   # An unmatched glob stays literal; skip it.
:;   [ -f "$file" ] || continue
:;   echo "Przetwarzanie $(basename "$file")..."
:;   chunk_file "$file" >> "$chunksFile"
:; done
|
| 37 |
+
:; echo "Osadzanie plików..."
:; # Remove any previous database so the import starts from scratch.
:; [ -f "$dbFile" ] && rm -- "$dbFile"
:; ./"$embedfile" import "$chunksFile" "$dbFile"
:; importStatus=$?
:; [ "$importStatus" -eq 0 ] && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
:; # The chunks file is only an intermediate artifact; remove it whether or not the import succeeded.
:; rm -- "$chunksFile"
:; # Exit with the import status (previously the status of rm was returned, hiding import failures).
:; # This exit also stops bash before it reaches the cmd.exe section that follows.
:; exit "$importStatus"
|
| 41 |
+
:; # Windows section: cmd.exe skips the label lines above and runs from here.
@echo off
setlocal EnableDelayedExpansion
rem Files used by the indexer.
set "inputDir=baza"
set "embedfile=bge-m3.embedfile"
set "chunksFile=chunks.txt"
set "dbFile=prosty-rag.db"
rem Chunk length and overlap, both counted in words.
set "chunkWords=200"
set "overlapWords=10"
rem Clear the overlap buffer slots, numbered from 1 up to overlapWords.
for /l %%n in (1,1,%overlapWords%) do set "buf[%%n]="
rem Create or truncate the chunks file.
break >"%chunksFile%"
|
| 52 |
+
rem Download whatever is missing: sample data, the embedding runtime and pdftotext.
rem Delayed expansion is enabled, so a literal exclamation mark is escaped with two carets.
rem With a single caret the mark is dropped from the message.
if not exist "%inputDir%" echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -Lo "%inputDir%\wikipedia.txt" "https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true" && echo Gotowe^^!
if not exist "%embedfile%" echo Pobieranie %embedfile%... && curl -Lo "%embedfile%" "https://huggingface.co/jglowa/prosty-rag/resolve/main/bge-m3.embedfile?download=true" && echo Gotowe^^!
if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -LO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe^^!
echo Indeksowanie plików PDF/TXT/MD w folderze %inputDir%...
rem Convert each PDF that does not yet have a .txt file of the same name next to it.
for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F"
|
| 57 |
+
rem Turn every .txt and .md file in the input folder into chunk lines appended to the chunks file.
rem Each chunk line starts with the file name and a colon. After chunkWords words a new line
rem is started and seeded with the words currently held in the overlap buffer.
for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
echo Przetwarzanie %%~nxF...
rem Output of the following group is appended to the chunks file.
(
set wordCount=0
rem set /p reading from nul prints its prompt text without ending the line
set /p =%%~nxF: <nul
for /f "usebackq delims=" %%L in ("%%F") do (
for %%W in (%%L) do (
set /p =%%W <nul
rem Shift the overlap buffer down by one slot, then store the current word in the last slot.
for /l %%i in (2,1,%overlapWords%) do (set /a j=%%i-1 && set buf[!j!]=!buf[%%i]!)
set "buf[%overlapWords%]=%%W"
set /a wordCount+=1
if !wordCount! geq !chunkWords! (
rem Chunk is full. End the line, repeat the file name prefix and replay the overlap buffer.
echo.
set /p =%%~nxF: <nul
for /l %%i in (1,1,%overlapWords%) do set /p =!buf[%%i]! <nul
set /a wordCount=0
)
)
)
rem End the last chunk line of this file.
echo.
)>>%chunksFile%
)
|
| 79 |
+
echo Osadzanie plików...
rem Remove any previous database so the import starts from scratch.
if exist %dbFile% del %dbFile%
rem Delayed expansion is enabled, so the literal exclamation mark is escaped with two carets.
rem With a single caret the mark is dropped from the message.
%embedfile% import %chunksFile% %dbFile% && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
rem The chunks file is only an intermediate artifact.
del %chunksFile%
endlocal
|