File size: 5,501 Bytes
282e004
82ad235
282e004
ab9166a
82ad235
 
282e004
 
 
 
f539820
358e3ed
91df4e8
282e004
6d4f0f5
77fe7fe
ab9166a
 
285c36e
6d4f0f5
03719c3
cc371e9
03719c3
358e3ed
91df4e8
 
32e5908
91df4e8
 
 
 
 
 
 
358e3ed
282e004
358e3ed
ab9166a
91df4e8
358e3ed
 
03719c3
282e004
358e3ed
 
 
6d4f0f5
03719c3
ab9166a
 
03719c3
6d4f0f5
03719c3
358e3ed
03719c3
358e3ed
 
 
 
 
 
6d4f0f5
 
 
 
 
358e3ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282e004
358e3ed
ab9166a
358e3ed
 
6d4f0f5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
:; shopt -s nullglob expand_aliases || setopt nullglob aliases; alias @set='' @echo='echo' # Polyglot header: cmd.exe skips lines starting with ":" as labels, the shell runs ":" (no-op) and then the rest; @set is aliased to nothing and @echo to echo so the cmd-style lines below are also valid shell. nullglob makes unmatched globs expand to nothing.
@echo Indeksator Prosty RAG v0.5 - Jerzy Glowacki na licencji Apache 2.0
:; # Variables (used by both the shell and the cmd half), in order: embedding runtime binary, embedding model file, model download URL, input folder, temporary chunk list, output database, words per chunk, words shared between consecutive chunks.
@set embedfile=prosty-rag.embedfile
@set embedmodelfile=multilingual-e5-large-instruct.gguf
@set embedmodelURL=https://huggingface.co/kcccat/multilingual-e5-large-instruct-Q6_K-GGUF/resolve/main/multilingual-e5-large-instruct-q6_k.gguf?download=true
@set inputDir=baza
@set chunksFile=chunks.txt
@set dbFile=prosty-rag.db
@set chunkWords=200
@set overlapWords=10
:; # *NIX section. Every line starts with ":;" so that cmd.exe treats it as a label and skips it.
:; # Map the kernel name reported by uname to the platform suffix used in the xpdf-tools archive name.
:; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
:; # Start from an empty chunk list.
:; >"$chunksFile"
:; # Installation: download whatever is missing.
:; # URLs are quoted on purpose: nullglob is enabled, so an unquoted word containing "?" that
:; # matches no file would be removed and curl would be called without a URL.
:; # curl -f turns HTTP errors into a failure instead of saving the error page under the target name.
:; [ ! -d "$inputDir" ] && echo "Pobieranie przykładowego pliku $inputDir/wikipedia.txt..." && curl -f --create-dirs -Lo "$inputDir/wikipedia.txt" 'https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true' && echo Gotowe!
:; [ ! -f "$embedfile" ] && echo "Pobieranie $embedfile..." && curl -fLo "$embedfile" 'https://github.com/niutech/llamafile/releases/download/0.9.3/embedfile' && chmod +x "$embedfile" && echo Gotowe!
:; [ ! -f "$embedmodelfile" ] && echo "Pobieranie $embedmodelfile..." && curl -fLo "$embedmodelfile" "$embedmodelURL" && echo Gotowe!
:; # Only the pdftotext binary is extracted from the archive; the archive itself is removed afterwards.
:; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -fLO "https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz" && tar --strip-components 2 -xzf "xpdf-tools-$OS-4.05.tar.gz" "xpdf-tools-$OS-4.05/bin64/pdftotext" && rm "xpdf-tools-$OS-4.05.tar.gz" && echo Gotowe!
:; # Run: convert PDF and CSV sources into a .txt file next to the original.
:; # A source is skipped when its .txt counterpart already exists.
:; echo "Indeksowanie plików PDF/TXT/MD/CSV w folderze $inputDir..."
:; for pdf in "$inputDir"/*.pdf; do [ ! -f "${pdf%.pdf}.txt" ] && echo "Konwertowanie $(basename "$pdf")..." && ./pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
:; # Each CSV data row becomes one sentence of the form "header: value, header: value."
:; # using the first row as headers. FPAT lets quoted fields contain commas.
:; # NOTE(review): FPAT is a GNU awk feature; other awk implementations presumably fall back to plain -F, splitting - confirm on macOS.
:; for csv in "$inputDir"/*.csv; do [ ! -f "${csv%.csv}.txt" ] && echo "Konwertowanie $(basename "$csv")..." && awk -F, -vFPAT='([^,]*)|("([^"]|"")+")' 'NR==1{for(i=1;i<=NF;i++) h[i]=$i; next} {for(i=1;i<=NF;i++) printf "%s: %s%s", h[i], $i, (i<NF? ", ":".\n")}' "$csv" > "${csv%.csv}.txt"; done
:; # emit_chunks FILE LABEL SIZE OVERLAP
:; #   Splits FILE into whitespace-separated words and prints windows of SIZE words, one window
:; #   per line, each prefixed with "LABEL: ". Consecutive windows share OVERLAP words.
:; #   Stops as soon as a window reaches the last word, so no window is a pure repeat of the
:; #   previous window's tail.
:; #   Returns 2 (message on stderr) unless SIZE >= 1 and 0 <= OVERLAP < SIZE; otherwise the
:; #   window start would never advance.
:; emit_chunks() {
:;   local file=$1 label=$2 size=$3 overlap=$4
:;   local -a words=()
:;   local total step start=0
:;   if ! ((size >= 1 && overlap >= 0 && overlap < size)); then
:;     echo "Błąd: wymagane chunkWords >= 1 oraz 0 <= overlapWords < chunkWords" >&2
:;     return 2
:;   fi
:;   step=$((size - overlap))
:;   # tr takes exactly two sets here; read -d '' consumes everything up to the NUL appended by printf.
:;   IFS=$'\n' read -rd '' -a words < <(tr -s '[:space:]' '\n' < "$file" | grep -v '^$' && printf '\0')
:;   total=${#words[@]}
:;   while ((start < total)); do
:;     printf '%s: %s\n' "$label" "${words[*]:start:size}"
:;     ((start + size >= total)) && break
:;     ((start += step))
:;   done
:;   return 0
:; }
:; # Append the chunks of every text/markdown file in the input folder to the chunk list.
:; for file in "$inputDir"/*.txt "$inputDir"/*.md; do
:;   [ -f "$file" ] || continue
:;   filename=$(basename "$file")
:;   echo "Przetwarzanie $filename..."
:;   emit_chunks "$file" "$filename" "$chunkWords" "$overlapWords" >> "$chunksFile" || exit 1
:; done
:; # Build the database: import the chunk list as embeddings, then add a full-text (FTS5) table over the same rows.
:; echo "Osadzanie plików w bazie danych..."
:; rm -f -- "$dbFile"
:; "./$embedfile" -m "$embedmodelfile" import "$chunksFile" "$dbFile" && "./$embedfile" -m "$embedmodelfile" sh "$dbFile" "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
:; # Remember the indexing result before cleanup so the script's exit code reflects it, not rm's.
:; status=$?
:; # The exit also keeps the shell from ever reaching the cmd.exe half below.
:; rm -f -- "$chunksFile"; exit "$status"
:; # Windows section. The shell half has already exited, so only cmd.exe executes the lines below.
@echo off
:; # Switch the console code page to 65001 (UTF-8).
chcp 65001 >nul
title Prosty RAG
:; # Delayed expansion is needed by the chunking loop further down.
setlocal enabledelayedexpansion
:; # Clear the overlap buffer buf[1..overlapWords], then start from an empty chunk list.
for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
break>"%chunksFile%"
:; # Installation: download whatever is missing. curl -f turns HTTP errors into a failure
:; # instead of saving the error page under the target name.
if not exist "%inputDir%" echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl -f --create-dirs -Lo "%inputDir%\wikipedia.txt" "https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true" && echo Gotowe^^!
if not exist "%embedfile%" echo Pobieranie %embedfile%... && curl -fLo "%embedfile%" https://github.com/niutech/llamafile/releases/download/0.9.3/embedfile && echo Gotowe^^!
if not exist "%embedmodelfile%" echo Pobieranie %embedmodelfile%... && curl -fLo "%embedmodelfile%" "%embedmodelURL%" && echo Gotowe^^!
if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -fLO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe^^!
:; # Run: convert PDF and CSV sources into a .txt file next to the original.
:; # A source is skipped when its .txt counterpart already exists.
echo Indeksowanie plików PDF/TXT/MD/CSV w folderze %inputDir%...
for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F"
:; # Each CSV row becomes one sentence of the form "header: value, header: value." via PowerShell Import-Csv.
for %%F in ("%inputDir%\*.csv") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && powershell -Command "Import-Csv '%%~F' | %% { (($_.PSObject.Properties | %% { \"$($_.Name): $($_.Value)\" }) -join ', ') + '.'} | Out-File '%%~dpnF.txt' -Encoding utf8"
:; # Chunking: append every text/markdown file to the chunk list as lines of the form
:; # "filename: word word ...". Words are streamed with "set /p =text <nul", which prints
:; # without a newline; a new line is started once chunkWords words have been written.
:; # The last overlapWords words are kept in buf[1..overlapWords] and repeated at the start
:; # of the next line. They count towards that line's total, so every full line holds
:; # chunkWords words, the same as in the shell half of this script.
for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
  echo Przetwarzanie %%~nxF...
  (
    set wordCount=0
    set /p =%%~nxF: <nul
    for /f "usebackq delims=" %%L in ("%%F") do (
      set line=%%L
      rem Replace every space with a line feed so the inner loop sees one word per iteration
      set line=!line: =^

!
      for /f "delims=" %%W in ("!line!") do (
        set /p =%%W <nul
        rem Shift the overlap buffer left by one slot and store the newest word in the last slot
        for /l %%i in (2,1,%overlapWords%) do (set /a j=%%i-1 && set buf[!j!]=!buf[%%i]!)
        set "buf[%overlapWords%]=%%W"
        set /a wordCount+=1
        if !wordCount! geq !chunkWords! (
          echo.
          set /p =%%~nxF: <nul
          for /l %%i in (1,1,%overlapWords%) do set /p =!buf[%%i]! <nul
          rem The repeated overlap words already belong to the new line, so count them
          set /a wordCount=%overlapWords%
        )
      )
    )
    echo.
  )>>"%chunksFile%"
)
:; # Build the database: import the chunk list as embeddings, then add a full-text (FTS5) table over the same rows.
echo Osadzanie plików w bazie danych...
if exist "%dbFile%" del "%dbFile%"
%embedfile% -m %embedmodelfile% import %chunksFile% %dbFile% && %embedfile% -m %embedmodelfile% sh %dbFile% "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
:; # Remember the indexing result before cleanup so it can be returned to the caller.
set "status=%errorlevel%"
del "%chunksFile%"
pause
:; # The status value is expanded when the line is parsed, so it survives the endlocal.
endlocal & exit /b %status%