| :; shopt -s nullglob expand_aliases || setopt nullglob aliases; alias @set='' @echo='echo' | |
| @echo Indeksator Prosty RAG v0.5 - Jerzy Glowacki na licencji Apache 2.0 | |
| :; # Variables shared by both halves of this polyglot file: the first line enables nullglob and alias expansion and aliases @set to nothing and @echo to echo, so each @set line below is a plain assignment in sh, while cmd.exe runs the same lines as ordinary set and echo commands and skips every line starting with a colon as a label. | |
| @set embedfile=prosty-rag.embedfile | |
| @set embedmodelfile=multilingual-e5-large-instruct.gguf | |
| @set embedmodelURL=https://huggingface.co/kcccat/multilingual-e5-large-instruct-Q6_K-GGUF/resolve/main/multilingual-e5-large-instruct-q6_k.gguf?download=true | |
| @set inputDir=baza | |
| @set chunksFile=chunks.txt | |
| @set dbFile=prosty-rag.db | |
| @set chunkWords=200 | |
| @set overlapWords=10 | |
| :; # POSIX shell half. Every line starts with ":;" so that cmd.exe treats it as a label and skips it. | |
| :; # Map the kernel name onto the platform tag used in the xpdf-tools archive names. | |
| :; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/') | |
| :; # Start from an empty chunks file; the explicit no-op command keeps zsh from reading stdin on a bare redirection. | |
| :; : > "$chunksFile" | |
| :; # Installation: fetch the sample corpus, the embedfile runtime, the embedding model and pdftotext when missing. | |
| :; # URLs are quoted because nullglob is on: an unquoted word containing "?" that matches no file would be dropped, leaving curl without a URL. | |
| :; [ ! -d "$inputDir" ] && echo "Pobieranie przykładowego pliku $inputDir/wikipedia.txt..." && curl --create-dirs -Lo "$inputDir/wikipedia.txt" 'https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true' && echo Gotowe! | |
| :; [ ! -f "$embedfile" ] && echo "Pobieranie $embedfile..." && curl -Lo "$embedfile" 'https://github.com/niutech/llamafile/releases/download/0.9.3/embedfile' && chmod +x "$embedfile" && echo Gotowe! | |
| :; [ ! -f "$embedmodelfile" ] && echo "Pobieranie $embedmodelfile..." && curl -Lo "$embedmodelfile" "$embedmodelURL" && echo Gotowe! | |
| :; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO "https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz" && tar --strip-components 2 -xzf "xpdf-tools-$OS-4.05.tar.gz" "xpdf-tools-$OS-4.05/bin64/pdftotext" && rm "xpdf-tools-$OS-4.05.tar.gz" && echo Gotowe! | |
| :; # Run: convert PDF and CSV sources to text, then cut every text file into overlapping word chunks. | |
| :; echo "Indeksowanie plików PDF/TXT/MD/CSV w folderze $inputDir..." | |
| :; # A non-positive stride would keep the chunking loop below from ever advancing, so refuse it up front. | |
| :; stepWords=$((chunkWords - overlapWords)) | |
| :; ((stepWords > 0)) || { echo "Błąd: chunkWords musi być większe od overlapWords." >&2; exit 1; } | |
| :; # PDFs are converted only when no sibling .txt exists yet; pdftotext writes the .txt next to the PDF. | |
| :; for pdf in "$inputDir"/*.pdf; do [ ! -f "${pdf%.pdf}.txt" ] && echo "Konwertowanie $(basename "$pdf")..." && ./pdftotext -nopgbrk -enc UTF-8 "$pdf"; done | |
| :; # CSV rows become one sentence per row made of "header: value" pairs, using the first row as headers. | |
| :; for csv in "$inputDir"/*.csv; do [ ! -f "${csv%.csv}.txt" ] && echo "Konwertowanie $(basename "$csv")..." && awk -F, -vFPAT='([^,]*)|("([^"]|"")+")' 'NR==1{for(i=1;i<=NF;i++) h[i]=$i; next} {for(i=1;i<=NF;i++) printf "%s: %s%s", h[i], $i, (i<NF? ", ":".\n")}' "$csv" > "${csv%.csv}.txt"; done | |
| :; for file in "$inputDir"/*.txt "$inputDir"/*.md; do | |
| :;   filename=$(basename "$file") | |
| :;   echo "Przetwarzanie $filename..." | |
| :;   # One word per line from tr, blank lines dropped; the trailing NUL lets read -d '' finish successfully. | |
| :;   # tr takes exactly two operands here: a third one makes it fail and leaves the word list empty. | |
| :;   words=() | |
| :;   IFS=$'\n' read -rd '' -a words < <(tr -s '[:space:]' '\n' < "$file" | grep -v '^$' && printf '\0') | |
| :;   totalWords=${#words[@]} | |
| :;   start=0 | |
| :;   # Emit chunkWords words per line, prefixed with the file name, advancing by chunkWords minus overlapWords. | |
| :;   while ((start < totalWords)); do | |
| :;     chunk=("${words[@]:start:chunkWords}") | |
| :;     echo "$filename: ${chunk[*]}" >> "$chunksFile" | |
| :;     ((start += stepWords)) | |
| :;   done | |
| :; done | |
| :; # Embedding: import the chunks into a fresh database file, then build an FTS5 table from the imported items. | |
| :; echo "Osadzanie plików w bazie danych..." | |
| :; rm -f -- "$dbFile" | |
| :; # The status is captured on the same line as the command chain because the leading ":" of any later line would reset it to zero. | |
| :; status=0; ./"$embedfile" -m "$embedmodelfile" import "$chunksFile" "$dbFile" && ./"$embedfile" -m "$embedmodelfile" sh "$dbFile" "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator." || status=$? | |
| :; # Remove the temporary chunks file and report the indexing result rather than the result of rm; the exit also keeps sh out of the cmd.exe half below. | |
| :; rm -f -- "$chunksFile"; exit "$status" | |
| :; # Windows cmd.exe half, never reached by sh because of the exit above: command echo is switched off, the console code page is set to 65001, the window title is set, delayed expansion is enabled, the buf slots 1 to overlapWords are cleared and the chunks file is truncated. | |
| @echo off | |
| chcp 65001 >nul | |
| title Prosty RAG | |
| setlocal enabledelayedexpansion | |
| for /l %%i in (1,1,%overlapWords%) do set buf[%%i]= | |
| break>%chunksFile% | |
| :; # Installation: download the sample corpus, the embedfile runtime, the embedding model and pdftotext.exe when they are missing. | |
| if not exist "%inputDir%" echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -Lo "%inputDir%\wikipedia.txt" https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe^^! | |
| if not exist "%embedfile%" echo Pobieranie %embedfile%... && curl -Lo "%embedfile%" https://github.com/niutech/llamafile/releases/download/0.9.3/embedfile && echo Gotowe^^! | |
| if not exist "%embedmodelfile%" echo Pobieranie %embedmodelfile%... && curl -Lo "%embedmodelfile%" %embedmodelURL% && echo Gotowe^^! | |
| if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -LO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe^^! | |
| :; # Run: convert PDF and CSV sources to text, then cut every text file into word chunks appended to the chunks file. | |
| echo Indeksowanie plików PDF/TXT/MD/CSV w folderze %inputDir%... | |
| :; # PDFs are converted only when no sibling .txt exists yet. | |
| for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F" | |
| :; # CSV rows are rendered by PowerShell as name: value pairs joined by commas, one sentence per row, saved as UTF-8 text. | |
| for %%F in ("%inputDir%\*.csv") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && powershell -Command "Import-Csv '%%~F' | %% { (($_.PSObject.Properties | %% { \"$($_.Name): $($_.Value)\" }) -join ', ') + '.'} | Out-File '%%~dpnF.txt' -Encoding utf8" | |
| :; # Chunking: each input line is split into words by replacing spaces with line feeds. Every word is written without a newline and also shifted into the buf slots, which hold the most recent overlapWords words. After chunkWords words a new output line is started with the file name followed by the buffered words, and the counter restarts at zero. | |
| for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do ( | |
| echo Przetwarzanie %%~nxF... | |
| ( | |
| set wordCount=0 | |
| set /p =%%~nxF: <nul | |
| for /f "usebackq delims=" %%L in ("%%F") do ( | |
| set line=%%L | |
| set line=!line: =^ | |
| ! | |
| for /f "delims=" %%W in ("!line!") do ( | |
| set /p =%%W <nul | |
| for /l %%i in (2,1,%overlapWords%) do (set /a j=%%i-1 && set buf[!j!]=!buf[%%i]!) | |
| set "buf[%overlapWords%]=%%W" | |
| set /a wordCount+=1 | |
| if !wordCount! geq !chunkWords! ( | |
| echo. | |
| set /p =%%~nxF: <nul | |
| for /l %%i in (1,1,%overlapWords%) do set /p =!buf[%%i]! <nul | |
| set /a wordCount=0 | |
| ) | |
| ) | |
| ) | |
| echo. | |
| )>>%chunksFile% | |
| ) | |
| :; # Embedding: delete any previous database, import the chunks file, then build an FTS5 table from the imported items. The chunks file is removed afterwards and the window waits for a key press. | |
| echo Osadzanie plików w bazie danych... | |
| if exist "%dbFile%" del "%dbFile%" | |
| %embedfile% -m %embedmodelfile% import %chunksFile% %dbFile% && %embedfile% -m %embedmodelfile% sh %dbFile% "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator. | |
| del "%chunksFile%" | |
| endlocal | |
| pause | |