Update indeksator.cmd
Browse files- indeksator.cmd +15 -19
indeksator.cmd
CHANGED
|
@@ -1,20 +1,21 @@
|
|
| 1 |
-
:;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
:; # *NIX:
|
| 3 |
:; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
|
| 4 |
-
:;
|
| 5 |
-
:; inputDir="baza"
|
| 6 |
-
:; chunksFile="chunks.txt"
|
| 7 |
-
:; dbFile="prosty-rag.db"
|
| 8 |
-
:; chunkWords=200
|
| 9 |
-
:; overlapWords=20
|
| 10 |
-
:; > $chunksFile
|
| 11 |
:; # Instalacja
|
| 12 |
:; [ ! -d $inputDir ] && echo Pobieranie przykładowego pliku $inputDir/wikipedia.txt... && curl --create-dirs -Lo $inputDir/wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
|
| 13 |
:; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
|
| 14 |
:; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-$OS-4.05.tar.gz xpdf-tools-$OS-4.05/bin64/pdftotext && rm xpdf-tools-$OS-4.05.tar.gz && echo Gotowe!
|
| 15 |
:; # Uruchamianie
|
| 16 |
:; echo "Indeksowanie plików PDF/TXT/MD/CSV w folderze $inputDir..."
|
| 17 |
-
:; shopt -s nullglob
|
| 18 |
:; for pdf in $inputDir/*.pdf; do [ ! -f "${pdf%.pdf}.txt" ] && echo "Konwertowanie $(basename "$pdf")..." && pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
|
| 19 |
:; for csv in $inputDir/*.csv; do [ ! -f "${csv%.csv}.txt" ] && echo "Konwertowanie $(basename "$csv")..." && awk -F, -vFPAT='([^,]*)|("([^"]|"")+")' 'NR==1{for(i=1;i<=NF;i++) h[i]=$i; next} {for(i=1;i<=NF;i++) printf "%s: %s%s", h[i], $i, (i<NF? ", ":".\n")}' "$csv" > "${csv%.csv}.txt"; done
|
| 20 |
:; for file in $inputDir/*.txt $inputDir/*.md; do
|
|
@@ -29,20 +30,15 @@
|
|
| 29 |
:; ((start += chunkWords - overlapWords))
|
| 30 |
:; done
|
| 31 |
:; done
|
| 32 |
-
:; echo "Osadzanie plików..."
|
| 33 |
:; [ -f $dbFile ] && rm $dbFile
|
| 34 |
-
:; ./$embedfile import $chunksFile $dbFile && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
|
| 35 |
:; rm $chunksFile; exit $?
|
| 36 |
:; # Windows:
|
| 37 |
@echo off
|
| 38 |
chcp 65001 >nul
|
|
|
|
| 39 |
setlocal enabledelayedexpansion
|
| 40 |
-
set embedfile=bge-m3.embedfile
|
| 41 |
-
set inputDir=baza
|
| 42 |
-
set chunksFile=chunks.txt
|
| 43 |
-
set dbFile=prosty-rag.db
|
| 44 |
-
set chunkWords=200
|
| 45 |
-
set overlapWords=20
|
| 46 |
for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
|
| 47 |
break>%chunksFile%
|
| 48 |
:; # Instalacja
|
|
@@ -79,9 +75,9 @@ for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
|
|
| 79 |
echo.
|
| 80 |
)>>%chunksFile%
|
| 81 |
)
|
| 82 |
-
echo Osadzanie plików...
|
| 83 |
if exist %dbFile% del %dbFile%
|
| 84 |
-
%embedfile% import %chunksFile% %dbFile% && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
|
| 85 |
del %chunksFile%
|
| 86 |
endlocal
|
| 87 |
pause
|
|
|
|
| 1 |
+
:; shopt -s nullglob expand_aliases || setopt nullglob aliases; alias @set='' @echo='echo'
|
| 2 |
+
@echo Indeksator Prosty RAG v0.4 - Jerzy Glowacki na licencji Apache 2.0
|
| 3 |
+
:; # Zmienne:
|
| 4 |
+
@set embedfile=bge-m3.embedfile
|
| 5 |
+
@set inputDir=baza
|
| 6 |
+
@set chunksFile=chunks.txt
|
| 7 |
+
@set dbFile=prosty-rag.db
|
| 8 |
+
@set chunkWords=200
|
| 9 |
+
@set overlapWords=20
|
| 10 |
:; # *NIX:
|
| 11 |
:; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
|
| 12 |
+
:; >$chunksFile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
:; # Instalacja
|
| 14 |
:; [ ! -d $inputDir ] && echo Pobieranie przykładowego pliku $inputDir/wikipedia.txt... && curl --create-dirs -Lo $inputDir/wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
|
| 15 |
:; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
|
| 16 |
:; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-$OS-4.05.tar.gz xpdf-tools-$OS-4.05/bin64/pdftotext && rm xpdf-tools-$OS-4.05.tar.gz && echo Gotowe!
|
| 17 |
:; # Uruchamianie
|
| 18 |
:; echo "Indeksowanie plików PDF/TXT/MD/CSV w folderze $inputDir..."
|
|
|
|
| 19 |
:; for pdf in $inputDir/*.pdf; do [ ! -f "${pdf%.pdf}.txt" ] && echo "Konwertowanie $(basename "$pdf")..." && pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
|
| 20 |
:; for csv in $inputDir/*.csv; do [ ! -f "${csv%.csv}.txt" ] && echo "Konwertowanie $(basename "$csv")..." && awk -F, -vFPAT='([^,]*)|("([^"]|"")+")' 'NR==1{for(i=1;i<=NF;i++) h[i]=$i; next} {for(i=1;i<=NF;i++) printf "%s: %s%s", h[i], $i, (i<NF? ", ":".\n")}' "$csv" > "${csv%.csv}.txt"; done
|
| 21 |
:; for file in $inputDir/*.txt $inputDir/*.md; do
|
|
|
|
| 30 |
:; ((start += chunkWords - overlapWords))
|
| 31 |
:; done
|
| 32 |
:; done
|
| 33 |
+
:; echo "Osadzanie plików w bazie danych..."
|
| 34 |
:; [ -f $dbFile ] && rm $dbFile
|
| 35 |
+
:; ./$embedfile import $chunksFile $dbFile && ./$embedfile sh $dbFile "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
|
| 36 |
:; rm $chunksFile; exit $?
|
| 37 |
:; # Windows:
|
| 38 |
@echo off
|
| 39 |
chcp 65001 >nul
|
| 40 |
+
title Prosty RAG
|
| 41 |
setlocal enabledelayedexpansion
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
|
| 43 |
break>%chunksFile%
|
| 44 |
:; # Instalacja
|
|
|
|
| 75 |
echo.
|
| 76 |
)>>%chunksFile%
|
| 77 |
)
|
| 78 |
+
echo Osadzanie plików w bazie danych...
|
| 79 |
if exist %dbFile% del %dbFile%
|
| 80 |
+
%embedfile% import %chunksFile% %dbFile% && %embedfile% sh %dbFile% "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
|
| 81 |
del %chunksFile%
|
| 82 |
endlocal
|
| 83 |
pause
|