File size: 5,501 Bytes

282e004
82ad235
282e004
ab9166a
82ad235
 
282e004
 
 
 
f539820
358e3ed
91df4e8
282e004
6d4f0f5
77fe7fe
ab9166a
 
285c36e
6d4f0f5
03719c3
cc371e9
03719c3
358e3ed
91df4e8
 
32e5908
91df4e8
 
 
 
 
 
 
358e3ed
282e004
358e3ed
ab9166a
91df4e8
358e3ed
 
03719c3
282e004
358e3ed
 
 
6d4f0f5
03719c3
ab9166a
 
03719c3
6d4f0f5
03719c3
358e3ed
03719c3
358e3ed
 
 
 
 
 
6d4f0f5
 
 
 
 
358e3ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282e004
358e3ed
ab9166a
358e3ed
 
6d4f0f5

:; shopt -s nullglob expand_aliases || setopt nullglob aliases; alias @set='' @echo='echo'
@echo Indeksator Prosty RAG v0.5 - Jerzy Glowacki na licencji Apache 2.0
:; # Variables shared by the *NIX and Windows halves: the alias line above turns @set into an empty word, so each line below is a plain assignment in the shell and a set command in cmd.exe. They name the embedfile binary, the embedding model file and its URL, the input folder, the temporary chunks file, the output database, and the chunk and overlap sizes in words.
@set embedfile=prosty-rag.embedfile
@set embedmodelfile=multilingual-e5-large-instruct.gguf
@set embedmodelURL=https://huggingface.co/kcccat/multilingual-e5-large-instruct-Q6_K-GGUF/resolve/main/multilingual-e5-large-instruct-q6_k.gguf?download=true
@set inputDir=baza
@set chunksFile=chunks.txt
@set dbFile=prosty-rag.db
@set chunkWords=200
@set overlapWords=10
:; # *NIX:
:; # Platform tag used in the xpdf-tools archive name, derived from uname (Linux becomes linux, Darwin becomes mac).
:; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
:; # Start with an empty chunks file.
:; : >"$chunksFile"
:; # Installation: fetch the sample document, the embedfile binary, the embedding model and pdftotext when they are missing.
:; # URLs are quoted because they contain a question mark, which is a glob character; with nullglob enabled an unquoted word that matches no file is removed and curl would receive no URL.
:; [ ! -d "$inputDir" ] && echo "Pobieranie przykładowego pliku $inputDir/wikipedia.txt..." && curl --create-dirs -Lo "$inputDir/wikipedia.txt" 'https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true' && echo Gotowe!
:; [ ! -f "$embedfile" ] && echo "Pobieranie $embedfile..." && curl -Lo "$embedfile" 'https://github.com/niutech/llamafile/releases/download/0.9.3/embedfile' && chmod +x "$embedfile" && echo Gotowe!
:; [ ! -f "$embedmodelfile" ] && echo "Pobieranie $embedmodelfile..." && curl -Lo "$embedmodelfile" "$embedmodelURL" && echo Gotowe!
:; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO "https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz" && tar --strip-components 2 -xzf "xpdf-tools-$OS-4.05.tar.gz" "xpdf-tools-$OS-4.05/bin64/pdftotext" && rm "xpdf-tools-$OS-4.05.tar.gz" && echo Gotowe!
:; # Run: convert each PDF with ./pdftotext and each CSV with awk (one line per row made of header: value pairs) into a .txt file next to the input, skipping inputs whose .txt already exists. NOTE(review): FPAT is a gawk feature - confirm quoted CSV fields are handled with other awk implementations.
:; echo "Indeksowanie plików PDF/TXT/MD/CSV w folderze $inputDir..."
:; for pdf in $inputDir/*.pdf; do [ ! -f "${pdf%.pdf}.txt" ] && echo "Konwertowanie $(basename "$pdf")..." && ./pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
:; for csv in $inputDir/*.csv; do [ ! -f "${csv%.csv}.txt" ] && echo "Konwertowanie $(basename "$csv")..." && awk -F, -vFPAT='([^,]*)|("([^"]|"")+")' 'NR==1{for(i=1;i<=NF;i++) h[i]=$i; next} {for(i=1;i<=NF;i++) printf "%s: %s%s", h[i], $i, (i<NF? ", ":".\n")}' "$csv" > "${csv%.csv}.txt"; done
:; # Chunking: split every .txt and .md file into whitespace-separated words and append overlapping windows of chunkWords words to the chunks file, one chunk per line, each prefixed with the file name.
:; # Guard the window step so that an overlap not smaller than the chunk size cannot make the loop run forever.
:; step=$((chunkWords - overlapWords)); ((step > 0)) || step=1
:; for file in "$inputDir"/*.txt "$inputDir"/*.md; do
:;   filename=$(basename "$file")
:;   echo "Przetwarzanie $filename..."
:;   # tr accepts exactly two sets when translating; a third operand makes it fail, which would leave the word list empty.
:;   # The trailing NUL lets read -d '' finish successfully; the array is filled one word per line.
:;   IFS=$'\n' read -rd '' -a words < <(tr -s '[:space:]' '\n' < "$file" | grep -v '^$' && printf '\0')
:;   totalWords=${#words[@]}
:;   start=0
:;   while ((start < totalWords)); do
:;     chunk=("${words[@]:start:chunkWords}")
:;     echo "$filename: ${chunk[*]}" >> "$chunksFile"
:;     ((start += step))
:;   done
:; done
:; # Embedding: rebuild the database from the chunks file, then add a full-text search table filled from the imported items.
:; echo "Osadzanie plików w bazie danych..."
:; [ -f "$dbFile" ] && rm "$dbFile"
:; ./"$embedfile" -m "$embedmodelfile" import "$chunksFile" "$dbFile" && ./"$embedfile" -m "$embedmodelfile" sh "$dbFile" "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
:; # Keep the result of the embedding step so that the cleanup below does not overwrite the exit status; exiting here also stops the shell before the Windows half.
:; status=$?
:; rm -f "$chunksFile"; exit "$status"
:; # Windows: the *NIX shell has already exited above, so the rest is run by cmd.exe only. Switch the console code page to 65001, enable delayed expansion, clear the overlap buffer buf[1..overlapWords] and create an empty chunks file.
@echo off
chcp 65001 >nul
title Prosty RAG
setlocal enabledelayedexpansion
for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
break>%chunksFile%
:; # Installation: download the sample document, the embedfile binary, the embedding model and pdftotext.exe when they are missing.
if not exist %inputDir% echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -Lo %inputDir%\wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true  && echo Gotowe^^!
if not exist %embedfile% echo Pobieranie %embedfile%... && curl -Lo %embedfile% https://github.com/niutech/llamafile/releases/download/0.9.3/embedfile && echo Gotowe^^!
if not exist %embedmodelfile% echo Pobieranie %embedmodelfile%... && curl -Lo %embedmodelfile% %embedmodelURL% && echo Gotowe^^!
if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -LO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe^^!
:; # Run: convert PDF files with pdftotext and CSV files with PowerShell Import-Csv into .txt files when the .txt does not exist yet, then write every .txt and .md file word by word into the chunks file. Each output line starts with the file name; after chunkWords words a new line is started and seeded with the last overlapWords words kept in buf. Finally the database is rebuilt with embedfile and the chunks file is deleted.
echo Indeksowanie plików PDF/TXT/MD/CSV w folderze %inputDir%...
for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F"
for %%F in ("%inputDir%\*.csv") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && powershell -Command "Import-Csv '%%~F' | %% { (($_.PSObject.Properties | %% { \"$($_.Name): $($_.Value)\" }) -join ', ') + '.'} | Out-File '%%~dpnF.txt' -Encoding utf8"
for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
  echo Przetwarzanie %%~nxF...
  (
    set wordCount=0
    set /p =%%~nxF: <nul
    for /f "usebackq delims=" %%L in ("%%F") do (
      set line=%%L
      set line=!line: =^

!
      for /f "delims=" %%W in ("!line!") do (
        set /p =%%W <nul
        for /l %%i in (2,1,%overlapWords%) do (set /a j=%%i-1 && set buf[!j!]=!buf[%%i]!)
        set "buf[%overlapWords%]=%%W"
        set /a wordCount+=1
        if !wordCount! geq !chunkWords! (
          echo.
          set /p =%%~nxF: <nul
          for /l %%i in (1,1,%overlapWords%) do set /p =!buf[%%i]! <nul
          set /a wordCount=0
        )
      )
    )
    echo.
  )>>%chunksFile%
)
echo Osadzanie plików w bazie danych...
if exist %dbFile% del %dbFile%
%embedfile% -m %embedmodelfile% import %chunksFile% %dbFile% && %embedfile% -m %embedmodelfile% sh %dbFile% "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
del %chunksFile%
endlocal
pause