jglowa committed on
Commit
358e3ed
verified
1 Parent(s): 88a671a

Update indeksator.cmd

Browse files
Files changed (1) hide show
  1. indeksator.cmd +83 -43
indeksator.cmd CHANGED
@@ -1,43 +1,83 @@
1
- :; # Indeksator Prosty RAG v0.1 - Jerzy Głowacki na licencji Apache 2.0
2
- @echo off
3
- setlocal enabledelayedexpansion
4
- set embedfile=bge-m3.embedfile
5
- set inputDir=baza
6
- set chunksFile=chunks.txt
7
- set dbFile=prosty-rag.db
8
- set chunkWords=200
9
- set overlapWords=10
10
- for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
11
- break>%chunksFile%
12
- if not exist %inputDir% echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -Lo %inputDir%\wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
13
- if not exist %embedfile% echo Pobieranie %embedfile%... && curl -Lo %embedfile% https://huggingface.co/jglowa/prosty-rag/resolve/main/bge-m3.embedfile?download=true && echo Gotowe!
14
- if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -LO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe!
15
- echo Indeksowanie plików PDF/TXT/MD w folderze %inputDir%...
16
- for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F"
17
- for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
18
- echo Przetwarzanie %%~nxF...
19
- (
20
- set wordCount=0
21
- set /p =%%~nxF: <nul
22
- for /f "usebackq delims=" %%L in ("%%F") do (
23
- for %%W in (%%L) do (
24
- set /p =%%W <nul
25
- for /l %%i in (2,1,%overlapWords%) do (set /a j=%%i-1 && set buf[!j!]=!buf[%%i]!)
26
- set "buf[%overlapWords%]=%%W"
27
- set /a wordCount+=1
28
- if !wordCount! geq !chunkWords! (
29
- echo.
30
- set /p =%%~nxF: <nul
31
- for /l %%i in (1,1,%overlapWords%) do set /p =!buf[%%i]! <nul
32
- set /a wordCount=0
33
- )
34
- )
35
- )
36
- echo.
37
- )>>%chunksFile%
38
- )
39
- echo Osadzanie plików...
40
- if exist %dbFile% del %dbFile%
41
- %embedfile% import %chunksFile% %dbFile% && echo Gotowe! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
42
- del %chunksFile%
43
- endlocal
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :; # Indeksator Prosty RAG v0.1 - Jerzy Głowacki na licencji Apache 2.0
2
+ :; # *NIX:
3
+ :; embedfile="bge-m3.embedfile"
4
+ :; inputDir="baza"
5
+ :; chunksFile="chunks.txt"
6
+ :; dbFile="prosty-rag.db"
7
+ :; chunkWords=200
8
+ :; overlapWords=10
9
+ :; declare -a buf
10
+ :; for ((i = 0; i < overlapWords; i++)); do buf[i]=""; done
11
+ :; > $chunksFile
12
+ :; [ ! -d "$inputDir" ] && echo "Pobieranie przykładowego pliku $inputDir/wikipedia.txt..." && curl --create-dirs -Lo "$inputDir/wikipedia.txt" "https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true" && echo Gotowe! # -d, not -f: $inputDir is a directory, so fetch the sample only when it is missing
13
+ :; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/jglowa/prosty-rag/resolve/main/bge-m3.llamafile?download=true && chmod +x $embedfile && echo Gotowe!
14
+ :; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-linux-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-linux-4.05.tar.gz xpdf-tools-linux-4.05/bin64/pdftotext && rm xpdf-tools-linux-4.05.tar.gz && echo Gotowe! # rm, not the cmd-only "del", removes the archive on *NIX
15
+ :; echo "Indeksowanie plików PDF/TXT/MD w folderze $inputDir..."
16
+ :; for pdf in "$inputDir"/*.pdf; do [ -f "$pdf" ] && [ ! -f "${pdf%.pdf}.txt" ] && echo "Konwertowanie $(basename "$pdf")..." && ./pdftotext -nopgbrk -enc UTF-8 "$pdf"; done # skip an unmatched glob and PDFs that already have a .txt; run the pdftotext downloaded into the current directory
17
+ :; for file in "$inputDir"/*.txt "$inputDir"/*.md; do
18
+ :; [ -f "$file" ] || continue # skip a glob pattern that matched nothing
19
+ :; name=$(basename "$file")
20
+ :; echo "Przetwarzanie $name..."
21
+ :; wordCount=0
22
+ :; { # everything printed inside this group is appended to the chunks file
23
+ :; printf '%s: ' "$name" # every chunk line starts with its source file name
24
+ :; while IFS= read -r line || [ -n "$line" ]; do
25
+ :; read -ra words <<< "$line" # split on whitespace without glob expansion
26
+ :; for word in "${words[@]}"; do
27
+ :; printf '%s ' "$word" # emit the word itself, as the Windows branch does
28
+ :; buf=("${buf[@]:1}" "$word") # slide the overlap window forward by one word
29
+ :; if (( ++wordCount >= chunkWords )); then
30
+ :; printf '\n%s: ' "$name"; printf '%s ' "${buf[@]}"; wordCount=0 # start a new chunk seeded with the overlap words
31
+ :; fi
32
+ :; done
33
+ :; done < "$file"
34
+ :; printf '\n' # terminate the last chunk of this file
35
+ :; } >> "$chunksFile"
36
+ :; done
37
+ :; echo "Osadzanie plików..."
38
+ :; [ -f $dbFile ] && rm $dbFile
39
+ :; ./$embedfile import $chunksFile $dbFile && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
40
+ :; rm "$chunksFile"; exit $?
41
+ :; # Windows:
42
+ @echo off
43
+ setlocal enabledelayedexpansion
44
+ set embedfile=bge-m3.embedfile
45
+ set inputDir=baza
46
+ set chunksFile=chunks.txt
47
+ set dbFile=prosty-rag.db
48
+ set chunkWords=200
49
+ set overlapWords=10
50
+ for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
51
+ break>%chunksFile%
52
+ if not exist %inputDir% echo Pobieranie przykładowego pliku %inputDir%\wikipedia.txt... && curl --create-dirs -Lo %inputDir%\wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe^!
53
+ if not exist %embedfile% echo Pobieranie %embedfile%... && curl -Lo %embedfile% https://huggingface.co/jglowa/prosty-rag/resolve/main/bge-m3.embedfile?download=true && echo Gotowe^!
54
+ if not exist pdftotext.exe echo Pobieranie pdftotext.exe... && curl -LO https://dl.xpdfreader.com/xpdf-tools-win-4.05.zip && tar --strip-components 2 -xf xpdf-tools-win-4.05.zip xpdf-tools-win-4.05/bin64/pdftotext.exe && del xpdf-tools-win-4.05.zip && echo Gotowe^!
55
+ echo Indeksowanie plików PDF/TXT/MD w folderze %inputDir%...
56
+ for %%F in ("%inputDir%\*.pdf") do if not exist "%%~dpnF.txt" echo Konwertowanie %%~nxF... && pdftotext -nopgbrk -enc UTF-8 "%%~F"
57
+ for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
58
+ echo Przetwarzanie %%~nxF...
59
+ (
60
+ set wordCount=0
61
+ set /p =%%~nxF: <nul
62
+ for /f "usebackq delims=" %%L in ("%%F") do (
63
+ for %%W in (%%L) do (
64
+ set /p =%%W <nul
65
+ for /l %%i in (2,1,%overlapWords%) do (set /a j=%%i-1 && set buf[!j!]=!buf[%%i]!)
66
+ set "buf[%overlapWords%]=%%W"
67
+ set /a wordCount+=1
68
+ if !wordCount! geq !chunkWords! (
69
+ echo.
70
+ set /p =%%~nxF: <nul
71
+ for /l %%i in (1,1,%overlapWords%) do set /p =!buf[%%i]! <nul
72
+ set /a wordCount=0
73
+ )
74
+ )
75
+ )
76
+ echo.
77
+ )>>%chunksFile%
78
+ )
79
+ echo Osadzanie plików...
80
+ if exist %dbFile% del %dbFile%
81
+ %embedfile% import %chunksFile% %dbFile% && echo Gotowe^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
82
+ del %chunksFile%
83
+ endlocal