jglowa committed on
Commit
282e004
verified
1 Parent(s): 3275223

Update indeksator.cmd

Browse files
Files changed (1) hide show
  1. indeksator.cmd +15 -19
indeksator.cmd CHANGED
@@ -1,20 +1,21 @@
1
- :; # Indeksator Prosty RAG v0.3 - Jerzy Głowacki na licencji Apache 2.0
 
 
 
 
 
 
 
 
2
  :; # *NIX:
3
  :; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
4
- :; embedfile="bge-m3.embedfile"
5
- :; inputDir="baza"
6
- :; chunksFile="chunks.txt"
7
- :; dbFile="prosty-rag.db"
8
- :; chunkWords=200
9
- :; overlapWords=20
10
- :; > $chunksFile
11
  :; # Instalacja
12
  :; [ ! -d $inputDir ] && echo Pobieranie przykładowego pliku $inputDir/wikipedia.txt... && curl --create-dirs -Lo $inputDir/wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
13
  :; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
14
  :; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-$OS-4.05.tar.gz xpdf-tools-$OS-4.05/bin64/pdftotext && rm xpdf-tools-$OS-4.05.tar.gz && echo Gotowe!
15
  :; # Uruchamianie
16
  :; echo "Indeksowanie plików PDF/TXT/MD/CSV w folderze $inputDir..."
17
- :; shopt -s nullglob
18
  :; for pdf in $inputDir/*.pdf; do [ ! -f "${pdf%.pdf}.txt" ] && echo "Konwertowanie $(basename "$pdf")..." && pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
19
  :; for csv in $inputDir/*.csv; do [ ! -f "${csv%.csv}.txt" ] && echo "Konwertowanie $(basename "$csv")..." && awk -F, -vFPAT='([^,]*)|("([^"]|"")+")' 'NR==1{for(i=1;i<=NF;i++) h[i]=$i; next} {for(i=1;i<=NF;i++) printf "%s: %s%s", h[i], $i, (i<NF? ", ":".\n")}' "$csv" > "${csv%.csv}.txt"; done
20
  :; for file in $inputDir/*.txt $inputDir/*.md; do
@@ -29,20 +30,15 @@
29
  :; ((start += chunkWords - overlapWords))
30
  :; done
31
  :; done
32
- :; echo "Osadzanie plików..."
33
  :; [ -f $dbFile ] && rm $dbFile
34
- :; ./$embedfile import $chunksFile $dbFile && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
35
  :; rm $chunksFile; exit $?
36
  :; # Windows:
37
  @echo off
38
  chcp 65001 >nul
 
39
  setlocal enabledelayedexpansion
40
- set embedfile=bge-m3.embedfile
41
- set inputDir=baza
42
- set chunksFile=chunks.txt
43
- set dbFile=prosty-rag.db
44
- set chunkWords=200
45
- set overlapWords=20
46
  for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
47
  break>%chunksFile%
48
  :; # Instalacja
@@ -79,9 +75,9 @@ for %%F in ("%inputDir%\*.txt" "%inputDir%\*.md") do (
79
  echo.
80
  )>>%chunksFile%
81
  )
82
- echo Osadzanie plików...
83
  if exist %dbFile% del %dbFile%
84
- %embedfile% import %chunksFile% %dbFile% && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
85
  del %chunksFile%
86
  endlocal
87
  pause
 
1
+ :; shopt -s nullglob expand_aliases || setopt nullglob aliases; alias @set='' @echo='echo'
2
+ @echo Indeksator Prosty RAG v0.4 - Jerzy Glowacki na licencji Apache 2.0
3
+ :; # Zmienne:
4
+ @set embedfile=bge-m3.embedfile
5
+ @set inputDir=baza
6
+ @set chunksFile=chunks.txt
7
+ @set dbFile=prosty-rag.db
8
+ @set chunkWords=200
9
+ @set overlapWords=20
10
  :; # *NIX:
11
  :; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
12
+ :; >$chunksFile
 
 
 
 
 
 
13
  :; # Instalacja
14
  :; [ ! -d $inputDir ] && echo Pobieranie przykładowego pliku $inputDir/wikipedia.txt... && curl --create-dirs -Lo $inputDir/wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
15
  :; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
16
  :; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-$OS-4.05.tar.gz xpdf-tools-$OS-4.05/bin64/pdftotext && rm xpdf-tools-$OS-4.05.tar.gz && echo Gotowe!
17
  :; # Uruchamianie
18
  :; echo "Indeksowanie plików PDF/TXT/MD/CSV w folderze $inputDir..."
 
19
  :; for pdf in $inputDir/*.pdf; do [ ! -f "${pdf%.pdf}.txt" ] && echo "Konwertowanie $(basename "$pdf")..." && pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
20
  :; for csv in $inputDir/*.csv; do [ ! -f "${csv%.csv}.txt" ] && echo "Konwertowanie $(basename "$csv")..." && awk -F, -vFPAT='([^,]*)|("([^"]|"")+")' 'NR==1{for(i=1;i<=NF;i++) h[i]=$i; next} {for(i=1;i<=NF;i++) printf "%s: %s%s", h[i], $i, (i<NF? ", ":".\n")}' "$csv" > "${csv%.csv}.txt"; done
21
  :; for file in $inputDir/*.txt $inputDir/*.md; do
 
30
  :; ((start += chunkWords - overlapWords))
31
  :; done
32
  :; done
33
+ :; echo "Osadzanie plików w bazie danych..."
34
  :; [ -f $dbFile ] && rm $dbFile
35
+ :; ./$embedfile import $chunksFile $dbFile && ./$embedfile sh $dbFile "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
36
  :; rm $chunksFile; exit $?
37
  :; # Windows:
38
  @echo off
39
  chcp 65001 >nul
40
+ title Prosty RAG
41
  setlocal enabledelayedexpansion
 
 
 
 
 
 
42
  for /l %%i in (1,1,%overlapWords%) do set buf[%%i]=
43
  break>%chunksFile%
44
  :; # Instalacja
 
75
  echo.
76
  )>>%chunksFile%
77
  )
78
+ echo Osadzanie plików w bazie danych...
79
  if exist %dbFile% del %dbFile%
80
+ %embedfile% import %chunksFile% %dbFile% && %embedfile% sh %dbFile% "CREATE VIRTUAL TABLE fts_items USING fts5(line, tokenize='porter')" "INSERT INTO fts_items SELECT * FROM items" && echo Gotowe^^! Po każdej zmianie w folderze %inputDir% należy uruchomić ponownie indeksator.
81
  del %chunksFile%
82
  endlocal
83
  pause