jglowa committed on
Commit
91df4e8
verified
1 Parent(s): acc5ae8

Update indeksator.cmd

Browse files
Files changed (1) hide show
  1. indeksator.cmd +13 -23
indeksator.cmd CHANGED
@@ -1,45 +1,35 @@
1
  :; # Indeksator Prosty RAG v0.1 - Jerzy Głowacki na licencji Apache 2.0
2
  :; # *NIX:
 
3
  :; embedfile="bge-m3.embedfile"
4
  :; inputDir="baza"
5
  :; chunksFile="chunks.txt"
6
  :; dbFile="prosty-rag.db"
7
  :; chunkWords=200
8
  :; overlapWords=10
9
- :; buf=()
10
- :; for ((i = 0; i < overlapWords; i++)); do buf[i]=""; done
11
  :; > $chunksFile
12
  :; [ ! -d $inputDir ] && echo Pobieranie przykładowego pliku $inputDir/wikipedia.txt... && curl --create-dirs -Lo $inputDir/wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
13
  :; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
14
- :; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-linux-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-linux-4.05.tar.gz xpdf-tools-linux-4.05/bin64/pdftotext && del xpdf-tools-linux-4.05.tar.gz && echo Gotowe!
15
  :; echo "Indeksowanie plików PDF/TXT/MD w folderze $inputDir..."
16
  :; shopt -s nullglob
17
  :; for pdf in $inputDir/*.pdf; do echo "Konwertowanie $(basename "$pdf")..." && pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
18
  :; for file in $inputDir/*.txt $inputDir/*.md; do
19
- :; echo "Przetwarzanie $(basename "$file")..."
20
- :; wordCount=0
21
- :; while IFS= read -r line || [ -n "$line" ]; do
22
- :; for word in $line; do
23
- :; for ((i = 0; i < overlapWords - 1; i++)); do buf[i]="${buf[i+1]}"; done
24
- :; buf[overlapWords-1]="$word"
25
- :; ((wordCount++))
26
- :; if (( wordCount >= chunkWords )); then
27
- :; {
28
- :; printf "%s: " $(basename "$file")
29
- :; for ((i = 0; i < overlapWords; i++)); do printf "%s " "${buf[i]}"; done
30
- :; printf "\n"
31
- :; } >> $chunksFile
32
- :; wordCount=0
33
- :; fi
34
- :; done
35
- :; done < "$file"
36
- :; echo >> $chunksFile
37
  :; done
38
  :; echo "Osadzanie plików..."
39
  :; [ -f $dbFile ] && rm $dbFile
40
  :; ./$embedfile import $chunksFile $dbFile && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
41
- :; # rm $chunksFile
42
- :; exit $?
43
  :; # Windows:
44
  @echo off
45
  setlocal enabledelayedexpansion
 
1
  :; # Indeksator Prosty RAG v0.1 - Jerzy Głowacki na licencji Apache 2.0
2
  :; # *NIX:
3
+ :; OS=$(uname -s | sed -e 's/^Linux$/linux/' -e 's/^Darwin$/mac/')
4
  :; embedfile="bge-m3.embedfile"
5
  :; inputDir="baza"
6
  :; chunksFile="chunks.txt"
7
  :; dbFile="prosty-rag.db"
8
  :; chunkWords=200
9
  :; overlapWords=10
 
 
10
  :; > $chunksFile
11
  :; [ ! -d $inputDir ] && echo Pobieranie przykładowego pliku $inputDir/wikipedia.txt... && curl --create-dirs -Lo $inputDir/wikipedia.txt https://huggingface.co/jglowa/prosty-rag/resolve/main/baza/wikipedia.txt?download=true && echo Gotowe!
12
  :; [ ! -f $embedfile ] && echo Pobieranie $embedfile... && curl -Lo $embedfile https://huggingface.co/asg017/embedfile/resolve/refs%2Fpr%2F2/bge-m3.embedfile?download=true && chmod +x $embedfile && echo Gotowe!
13
+ :; [ ! -f pdftotext ] && echo Pobieranie pdftotext... && curl -LO https://dl.xpdfreader.com/xpdf-tools-$OS-4.05.tar.gz && tar --strip-components 2 -xzf xpdf-tools-$OS-4.05.tar.gz xpdf-tools-$OS-4.05/bin64/pdftotext && del xpdf-tools-$OS-4.05.tar.gz && echo Gotowe!
14
  :; echo "Indeksowanie plików PDF/TXT/MD w folderze $inputDir..."
15
  :; shopt -s nullglob
16
  :; for pdf in $inputDir/*.pdf; do echo "Konwertowanie $(basename "$pdf")..." && pdftotext -nopgbrk -enc UTF-8 "$pdf"; done
17
  :; for file in $inputDir/*.txt $inputDir/*.md; do
18
+ :; filename=$(basename "$file")
19
+ :; echo "Przetwarzanie $filename..."
20
+ :; mapfile -t words < <(tr -s '[:space:]' '\n' < "$file" | grep -v '^$')
21
+ :; totalWords=${#words[@]}
22
+ :; start=0
23
+ :; while ((start < totalWords)); do
24
+ :; chunk=("${words[@]:start:chunkWords}")
25
+ :; echo "$filename: ${chunk[*]}" >> $chunksFile
26
+ :; ((start += chunkWords - overlapWords))
27
+ :; done
 
 
 
 
 
 
 
 
28
  :; done
29
  :; echo "Osadzanie plików..."
30
  :; [ -f $dbFile ] && rm $dbFile
31
  :; ./$embedfile import $chunksFile $dbFile && echo "Gotowe! Po każdej zmianie w folderze $inputDir należy uruchomić ponownie indeksator."
32
+ :; rm $chunksFile; exit $?
 
33
  :; # Windows:
34
  @echo off
35
  setlocal enabledelayedexpansion