File size: 5,501 Bytes
282e004 82ad235 282e004 ab9166a 82ad235 282e004 f539820 358e3ed 91df4e8 282e004 6d4f0f5 77fe7fe ab9166a 285c36e 6d4f0f5 03719c3 cc371e9 03719c3 358e3ed 91df4e8 32e5908 91df4e8 358e3ed 282e004 358e3ed ab9166a 91df4e8 358e3ed 03719c3 282e004 358e3ed 6d4f0f5 03719c3 ab9166a 03719c3 6d4f0f5 03719c3 358e3ed 03719c3 358e3ed 6d4f0f5 358e3ed 282e004 358e3ed ab9166a 358e3ed 6d4f0f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
:; shopt -s nullglob expand_aliases || setopt nullglob aliases; alias @set='' @echo='echo' # polyglot prologue: cmd.exe reads a ":" line as a label and skips it; bash (or zsh via setopt) enables nullglob and aliases @set/@echo so the shared lines below run in both
@echo Indeksator Prosty RAG v0.5 - Jerzy Glowacki na licencji Apache 2.0
:; # Variables (in the shell the empty @set alias leaves a plain assignment; in cmd.exe the same line is a SET command, so never append comments to these lines):
@set embedfile=prosty-rag.embedfile
@set embedmodelfile=multilingual-e5-large-instruct.gguf
@set embedmodelURL=https://huggingface.co/kcccat/multilingual-e5-large-instruct-Q6_K-GGUF/resolve/main/multilingual-e5-large-instruct-q6_k.gguf?download=true
@set inputDir=baza
@set chunksFile=chunks.txt
@set dbFile=prosty-rag.db
@set chunkWords=200
@set overlapWords=10
:; # *NIX section: every line starts with ":;" so cmd.exe treats it as a label and skips it.
:; # Map the kernel name to the platform suffix used in the xpdf-tools archive name.
:; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
:; # Start from an empty chunks file; the explicit no-op command keeps the truncation portable across shells.
:; : > "$chunksFile"
:; # fetch URL DEST: download URL to DEST.
:; # -f makes HTTP errors fail instead of saving the server's error page as DEST, and the
:; # temporary ".part" name keeps an interrupted download from being mistaken for a finished one.
:; fetch() {
:;   if curl -fL --create-dirs -o "$2.part" "$1"; then
:;     mv -f -- "$2.part" "$2"
:;   else
:;     rm -f -- "$2.part"
:;     return 1
:;   fi
:; }
:; # Installation: download whatever is missing.
:; # URLs must stay quoted: they contain "?", and with the nullglob option enabled at the top of
:; # this file an unquoted, unmatched glob pattern is silently removed from the command line.
:; if [ ! -d "$inputDir" ]; then
:;   echo "Pobieranie przykładowego pliku $inputDir/wikipedia.txt..."
:;   fetch 'https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true' "$inputDir/wikipedia.txt" && echo Gotowe!
:; fi
:; if [ ! -f "$embedfile" ]; then
:;   echo "Pobieranie $embedfile..."
:;   fetch 'https://github.com/niutech/llamafile/releases/download/0.9.3/embedfile' "$embedfile" && chmod +x "$embedfile" && echo Gotowe!
:; fi
:; if [ ! -f "$embedmodelfile" ]; then
:;   echo "Pobieranie $embedmodelfile..."
:;   fetch "$embedmodelURL" "$embedmodelfile" && echo Gotowe!
:; fi
:; if [ ! -f pdftotext ]; then
:;   echo "Pobieranie pdftotext..."
:;   xpdf="xpdf-tools-$OS-4.05"
:;   # Extract only the 64-bit pdftotext binary into the current directory, then drop the archive.
:;   fetch "https://dl.xpdfreader.com/$xpdf.tar.gz" "$xpdf.tar.gz" && tar --strip-components 2 -xzf "$xpdf.tar.gz" "$xpdf/bin64/pdftotext" && rm -f -- "$xpdf.tar.gz" && echo Gotowe!
:; fi
:; # Run: convert PDF and CSV sources to plain-text files next to the originals.
:; echo "Indeksowanie plików PDF/TXT/MD/CSV w folderze $inputDir..."
:; # csv_to_text FILE: print every data row of FILE as "header: value, header: value." on stdout,
:; # taking the header names from the first row. FPAT (quoted fields that contain commas) is
:; # honoured by gawk; an awk without FPAT support falls back to plain comma splitting via -F.
:; csv_to_text() {
:;   awk -F, -v FPAT='([^,]*)|("([^"]|"")+")' 'NR==1{for(i=1;i<=NF;i++) h[i]=$i; next} {for(i=1;i<=NF;i++) printf "%s: %s%s", h[i], $i, (i<NF? ", ":".\n")}' "$1"
:; }
:; for pdf in "$inputDir"/*.pdf; do
:;   # Skip an unmatched pattern and files that already have a text version.
:;   [ -f "$pdf" ] || continue
:;   [ -f "${pdf%.pdf}.txt" ] && continue
:;   echo "Konwertowanie $(basename "$pdf")..."
:;   ./pdftotext -nopgbrk -enc UTF-8 "$pdf"
:; done
:; for csv in "$inputDir"/*.csv; do
:;   [ -f "$csv" ] || continue
:;   txt=${csv%.csv}.txt
:;   [ -f "$txt" ] && continue
:;   echo "Konwertowanie $(basename "$csv")..."
:;   # Write to a temporary name first: a failed conversion must not leave an empty .txt behind,
:;   # because that file would be indexed and would also block the retry on the next run.
:;   if csv_to_text "$csv" > "$txt.part"; then mv -f -- "$txt.part" "$txt"; else rm -f -- "$txt.part"; fi
:; done
:; # chunk_file FILE SIZE OVERLAP LABEL
:; # Print FILE as lines of at most SIZE whitespace-separated words, each line prefixed with
:; # "LABEL: "; consecutive lines share OVERLAP words. Prints nothing for a file without words.
:; # Returns 1 (with a message on stderr) if FILE is unreadable or SIZE is not greater than OVERLAP.
:; chunk_file() {
:;   local file=$1 size=$2 overlap=$3 label=$4 start=0 step
:;   local -a words=() chunk=()
:;   step=$((size - overlap))
:;   # A non-positive step would never advance the window and the loop below would not terminate.
:;   if ((size <= 0 || step <= 0)); then
:;     echo "chunk_file: chunkWords must be greater than overlapWords" >&2
:;     return 1
:;   fi
:;   if [ ! -r "$file" ]; then
:;     echo "chunk_file: cannot read $file" >&2
:;     return 1
:;   fi
:;   # tr takes exactly two sets here: every run of whitespace becomes a single newline, i.e. one
:;   # word per line. read reaches EOF before its NUL delimiter and returns non-zero, but the
:;   # array has been filled by then; newline-only IFS also discards empty lines.
:;   IFS=$'\n' read -r -d '' -a words < <(tr -s '[:space:]' '\n' < "$file") || true
:;   while ((start < ${#words[@]})); do
:;     chunk=("${words[@]:start:size}")
:;     printf '%s: %s\n' "$label" "${chunk[*]}"
:;     ((start += step))
:;   done
:; }
:; # Append the chunks of every TXT/MD file in the input folder to the chunks file.
:; for file in "$inputDir"/*.txt "$inputDir"/*.md; do
:;   [ -f "$file" ] || continue
:;   filename=$(basename "$file")
:;   echo "Przetwarzanie $filename..."
:;   chunk_file "$file" "$chunkWords" "$overlapWords" "$filename" >> "$chunksFile"
:; done
:; # Embedding: rebuild the database from scratch out of the chunks file.
:; echo "Osadzanie plików w bazie danych..."
:; rm -f -- "$dbFile"
:; # Import the chunks, then copy them into an FTS5 table; the final message is printed only if both steps succeed.
:; ./"$embedfile" -m "$embedmodelfile" import "$chunksFile" "$dbFile" && ./"$embedfile" -m "$embedmodelfile" sh "$dbFile" "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
:; # Remember the indexing result before cleaning up, so the script's exit status reports
:; # whether indexing worked rather than whether the temporary file could be removed.
:; status=$?
:; rm -f -- "$chunksFile"
:; # Exiting here also keeps the shell from ever reading the cmd.exe section below.
:; exit $status
:; # Windows section: cmd.exe has skipped every ":" line above as a label; the shell has already exited before this point.
@echo off
rem Switch the console code page to UTF-8 so the Polish messages print correctly.
chcp 65001 >nul
title Prosty RAG
rem Delayed expansion lets the loops below read variables that change while they run.
setlocal enabledelayedexpansion
rem Clear buf[1..overlapWords], the buffer that later holds the most recent words of a chunk.
for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
rem Start from an empty chunks file.
break>%chunksFile%
:; # Installation: download whatever is missing. curl -f fails on an HTTP error instead of saving the server's error page under the target name.
if not exist %inputDir% echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -fLo %inputDir%\wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe^^!
if not exist %embedfile% echo Pobieranie %embedfile%... && curl -fLo %embedfile% https://github.com/niutech/llamafile/releases/download/0.9.3/embedfile && echo Gotowe^^!
if not exist %embedmodelfile% echo Pobieranie %embedmodelfile%... && curl -fLo %embedmodelfile% %embedmodelURL% && echo Gotowe^^!
if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -fLO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe^^!
:; # Run: convert PDF and CSV sources to text files next to the originals, skipping files that already have a .txt version.
echo Indeksowanie plików PDF/TXT/MD/CSV w folderze %inputDir%...
for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F"
rem Each CSV row becomes one sentence of the form: column name, colon, value, joined by commas and closed with a dot.
for %%F in ("%inputDir%\*.csv") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && powershell -Command "Import-Csv '%%~F' | %% { (($_.PSObject.Properties | %% { \"$($_.Name): $($_.Value)\" }) -join ', ') + '.'} | Out-File '%%~dpnF.txt' -Encoding utf8"
rem Chunking: append every TXT and MD file to the chunks file as lines that start with the file name.
rem A new line is started after chunkWords words and begins with the last overlapWords words seen so far.
for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
echo Przetwarzanie %%~nxF...
(
set wordCount=0
rem set /p fed from nul prints its prompt text without a trailing newline.
set /p =%%~nxF: <nul
for /f "usebackq delims=" %%L in ("%%F") do (
set line=%%L
rem Replace every space with a line feed so that the inner for /f sees one word per line.
rem The caret must be followed by one completely empty line or no line feed is inserted and the words run together.
set line=!line: =^

!
for /f "delims=" %%W in ("!line!") do (
set /p =%%W <nul
rem Shift the overlap buffer left by one slot and store the current word in the last slot.
for /l %%i in (2,1,%overlapWords%) do (set /a j=%%i-1 && set buf[!j!]=!buf[%%i]!)
set "buf[%overlapWords%]=%%W"
set /a wordCount+=1
if !wordCount! geq !chunkWords! (
rem Chunk is full: end the line, then open the next one with the file name and the buffered overlap words.
echo.
set /p =%%~nxF: <nul
for /l %%i in (1,1,%overlapWords%) do set /p =!buf[%%i]! <nul
set /a wordCount=0
)
)
)
rem Terminate the last line of this file.
echo.
)>>%chunksFile%
)
rem Embedding: rebuild the database from scratch out of the chunks file.
echo Osadzanie plików w bazie danych...
if exist %dbFile% del %dbFile%
rem Import the chunks, then copy them into an FTS5 table. The final message is printed only if both steps succeed.
%embedfile% -m %embedmodelfile% import %chunksFile% %dbFile% && %embedfile% -m %embedmodelfile% sh %dbFile% "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
rem The chunks file is only an intermediate artifact.
del %chunksFile%
endlocal
rem Wait for a key press before finishing.
pause
|