João Lima committed on
Commit
26b761b
·
1 Parent(s): c4ff6b7

changing gradio version

Browse files
Files changed (3) hide show
  1. app.py +3 -3
  2. ingestion/pdf.py +22 -11
  3. rag/retriever.py +0 -4
app.py CHANGED
@@ -16,13 +16,13 @@ def load_document(file):
16
 
17
  def ask(question):
18
  if vectorstore is None:
19
- return "Upload a document first", "", ""
20
  if not question.strip():
21
- return "Please enter a question", "", ""
22
  try:
23
  return run_rag(question, vectorstore)
24
  except Exception as e:
25
- return f"Error: {str(e)}", "", ""
26
 
27
  with gr.Blocks(title="Tech Explainer RAG") as demo:
28
  gr.Markdown("# Tech Explainer — RAG with Automatic Evaluation")
 
16
 
17
def ask(question):
    """Answer *question* using the RAG pipeline over the loaded vectorstore.

    Returns a 3-tuple of strings; on any precondition failure or runtime
    error the first element carries the message and the rest are empty.
    """
    # Validate preconditions first and collect a single error message.
    error = None
    if vectorstore is None:
        error = "Upload a document first"
    elif not question.strip():
        error = "Please enter a question"
    if error is not None:
        return error, "", ""
    try:
        return run_rag(question, vectorstore)
    except Exception as e:
        # Surface any pipeline failure to the UI rather than crashing it.
        return f"Error: {str(e)}", "", ""
26
 
27
  with gr.Blocks(title="Tech Explainer RAG") as demo:
28
  gr.Markdown("# Tech Explainer — RAG with Automatic Evaluation")
ingestion/pdf.py CHANGED
@@ -1,22 +1,33 @@
1
  from pypdf import PdfReader
2
- from langchain_text_splitter import RecursiveCharacterTextSplitter
3
- from langchain_core.document import Document
4
  from rag.retriever import build_vectorstore
5
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def process_pdf(file_path):
8
- """
9
- file_path: string - caminho para o arquivo PDF
10
- """
11
  reader = PdfReader(file_path)
12
  text = "".join(page.extract_text() or "" for page in reader.pages)
13
 
14
- splitter = RecursiveCharacterTextSplitter(
15
- chunk_size=500,
16
- chunk_overlap=100
17
- )
18
-
19
- chunks = splitter.split_text(text)
20
  documents = [Document(page_content=c) for c in chunks]
21
 
22
  return build_vectorstore(documents)
 
1
  from pypdf import PdfReader
 
 
2
  from rag.retriever import build_vectorstore
3
 
4
 
5
class Document:
    """Minimal LangChain-style document: a chunk of text plus a metadata dict."""

    def __init__(self, page_content, metadata=None):
        # page_content: the raw text of this chunk.
        self.page_content = page_content
        # Use an identity check rather than `metadata or {}` so that a
        # caller-supplied empty dict is kept (mutations remain visible to
        # the caller); a fresh dict is created only when none was given.
        self.metadata = metadata if metadata is not None else {}
9
+
10
+
11
def split_text(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into fixed-size overlapping chunks.

    Args:
        text: the full text to split.
        chunk_size: maximum length of each chunk, in characters.
        chunk_overlap: number of characters shared between consecutive chunks.

    Returns:
        A list of non-blank chunk strings (whitespace-only chunks are dropped).

    Raises:
        ValueError: if chunk_overlap >= chunk_size (the scan step would be
        zero or negative, which would loop forever).
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    chunks = []
    step = chunk_size - chunk_overlap
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        # Skip chunks that contain no visible characters.
        if chunk.strip():
            chunks.append(chunk)
    return chunks
24
+
25
+
26
def process_pdf(file_path):
    """Read the PDF at *file_path*, chunk its text, and build a vectorstore.

    Pages with no extractable text are treated as empty strings.
    Returns whatever build_vectorstore() returns for the chunked documents.
    """
    pages = PdfReader(file_path).pages
    # Concatenate every page's text; extract_text() may return None.
    full_text = "".join(page.extract_text() or "" for page in pages)

    docs = [
        Document(page_content=chunk)
        for chunk in split_text(full_text, chunk_size=500, chunk_overlap=100)
    ]
    return build_vectorstore(docs)
rag/retriever.py CHANGED
@@ -5,7 +5,6 @@ from config import EMBEDDING_MODEL
5
 
6
 
7
  class SimpleVectorStore:
8
- """Vectorstore simples usando FAISS"""
9
 
10
  def __init__(self, embeddings, documents):
11
  self.embeddings = embeddings
@@ -14,7 +13,6 @@ class SimpleVectorStore:
14
  self._build_index()
15
 
16
  def _build_index(self):
17
- """Constrói índice FAISS"""
18
  texts = [doc.page_content for doc in self.documents]
19
  vectors = self.embeddings.encode(texts)
20
 
@@ -23,7 +21,6 @@ class SimpleVectorStore:
23
  self.index.add(np.array(vectors).astype('float32'))
24
 
25
  def similarity_search(self, query, k=3):
26
- """Busca os k documentos mais similares"""
27
  query_vector = self.embeddings.encode([query])
28
  distances, indices = self.index.search(
29
  np.array(query_vector).astype('float32'),
@@ -33,7 +30,6 @@ class SimpleVectorStore:
33
  return [self.documents[i] for i in indices[0]]
34
 
35
 
36
- # Carrega o modelo de embeddings uma vez
37
  _embeddings_model = SentenceTransformer(EMBEDDING_MODEL)
38
 
39
 
 
5
 
6
 
7
  class SimpleVectorStore:
 
8
 
9
  def __init__(self, embeddings, documents):
10
  self.embeddings = embeddings
 
13
  self._build_index()
14
 
15
  def _build_index(self):
 
16
  texts = [doc.page_content for doc in self.documents]
17
  vectors = self.embeddings.encode(texts)
18
 
 
21
  self.index.add(np.array(vectors).astype('float32'))
22
 
23
  def similarity_search(self, query, k=3):
 
24
  query_vector = self.embeddings.encode([query])
25
  distances, indices = self.index.search(
26
  np.array(query_vector).astype('float32'),
 
30
  return [self.documents[i] for i in indices[0]]
31
 
32
 
 
33
  _embeddings_model = SentenceTransformer(EMBEDDING_MODEL)
34
 
35