Spaces:
Sleeping
Sleeping
João Lima
committed on
Commit
·
26b761b
1
Parent(s):
c4ff6b7
changing gradio version
Browse files- app.py +3 -3
- ingestion/pdf.py +22 -11
- rag/retriever.py +0 -4
app.py
CHANGED
|
@@ -16,13 +16,13 @@ def load_document(file):
|
|
| 16 |
|
| 17 |
def ask(question):
|
| 18 |
if vectorstore is None:
|
| 19 |
-
return "
|
| 20 |
if not question.strip():
|
| 21 |
-
return "
|
| 22 |
try:
|
| 23 |
return run_rag(question, vectorstore)
|
| 24 |
except Exception as e:
|
| 25 |
-
return f"
|
| 26 |
|
| 27 |
with gr.Blocks(title="Tech Explainer RAG") as demo:
|
| 28 |
gr.Markdown("# Tech Explainer — RAG with Automatic Evaluation")
|
|
|
|
| 16 |
|
| 17 |
def ask(question):
    """Answer *question* using the module-level ``vectorstore``.

    Args:
        question: The question text; ``None`` or a blank string is rejected.

    Returns:
        The result of ``run_rag(question, vectorstore)`` on success.
        Otherwise a 3-tuple whose first item is a message for the user and
        whose other two items are empty strings.
    """
    if vectorstore is None:
        return "Upload a document first", "", ""
    # Treat a missing value (None) like a blank question instead of letting
    # ``.strip()`` raise AttributeError outside the try block below.
    if not question or not question.strip():
        return "Please enter a question", "", ""
    try:
        return run_rag(question, vectorstore)
    except Exception as e:
        # Report the failure in the first output slot instead of propagating.
        return f"Error: {str(e)}", "", ""
|
| 26 |
|
| 27 |
with gr.Blocks(title="Tech Explainer RAG") as demo:
|
| 28 |
gr.Markdown("# Tech Explainer — RAG with Automatic Evaluation")
|
ingestion/pdf.py
CHANGED
|
@@ -1,22 +1,33 @@
|
|
| 1 |
from pypdf import PdfReader
|
| 2 |
-
from langchain_text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
-
from langchain_core.document import Document
|
| 4 |
from rag.retriever import build_vectorstore
|
| 5 |
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
def process_pdf(file_path):
|
| 8 |
-
"""
|
| 9 |
-
file_path: string - caminho para o arquivo PDF
|
| 10 |
-
"""
|
| 11 |
reader = PdfReader(file_path)
|
| 12 |
text = "".join(page.extract_text() or "" for page in reader.pages)
|
| 13 |
|
| 14 |
-
|
| 15 |
-
chunk_size=500,
|
| 16 |
-
chunk_overlap=100
|
| 17 |
-
)
|
| 18 |
-
|
| 19 |
-
chunks = splitter.split_text(text)
|
| 20 |
documents = [Document(page_content=c) for c in chunks]
|
| 21 |
|
| 22 |
return build_vectorstore(documents)
|
|
|
|
| 1 |
from pypdf import PdfReader
|
|
|
|
|
|
|
| 2 |
from rag.retriever import build_vectorstore
|
| 3 |
|
| 4 |
|
| 5 |
+
class Document:
    """Minimal container for a chunk of text and its metadata.

    Attributes:
        page_content: The text of the chunk.
        metadata: Key/value information about the chunk; a fresh empty dict
            when none is supplied.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Substitute a new dict only when nothing was passed, so a
        # caller-supplied (possibly empty) dict is stored as-is rather than
        # being silently replaced by a different object.
        self.metadata = {} if metadata is None else metadata

    def __repr__(self):
        return (
            f"{type(self).__name__}(page_content={self.page_content!r}, "
            f"metadata={self.metadata!r})"
        )
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def split_text(text, chunk_size=500, chunk_overlap=100):
    """Split *text* into fixed-size character chunks that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by consecutive chunks;
            must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of chunks in document order. Chunks that contain only
        whitespace are omitted.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap``
            is negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A step of zero or less would never advance through the text, and a
    # negative overlap would silently drop characters between chunks.
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            "chunk_overlap must be >= 0 and smaller than chunk_size"
        )

    step = chunk_size - chunk_overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunk = text[start:end]
        if chunk.strip():
            chunks.append(chunk)
        # Once a chunk reaches the end of the text, any further window would
        # contain only characters already present in this chunk.
        if end >= text_length:
            break

    return chunks
|
| 24 |
+
|
| 25 |
+
|
| 26 |
def process_pdf(file_path):
    """Read a PDF, chunk its text, and build a vector store from the chunks.

    Args:
        file_path: Path to the PDF file, passed to ``PdfReader``.

    Returns:
        Whatever ``build_vectorstore`` returns for the chunk documents.
    """
    reader = PdfReader(file_path)
    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next. A page whose extraction returns a
    # falsy value contributes an empty string.
    text = "\n".join(page.extract_text() or "" for page in reader.pages)

    chunks = split_text(text, chunk_size=500, chunk_overlap=100)
    documents = [Document(page_content=c) for c in chunks]

    return build_vectorstore(documents)
|
rag/retriever.py
CHANGED
|
@@ -5,7 +5,6 @@ from config import EMBEDDING_MODEL
|
|
| 5 |
|
| 6 |
|
| 7 |
class SimpleVectorStore:
|
| 8 |
-
"""Vectorstore simples usando FAISS"""
|
| 9 |
|
| 10 |
def __init__(self, embeddings, documents):
|
| 11 |
self.embeddings = embeddings
|
|
@@ -14,7 +13,6 @@ class SimpleVectorStore:
|
|
| 14 |
self._build_index()
|
| 15 |
|
| 16 |
def _build_index(self):
|
| 17 |
-
"""Constrói índice FAISS"""
|
| 18 |
texts = [doc.page_content for doc in self.documents]
|
| 19 |
vectors = self.embeddings.encode(texts)
|
| 20 |
|
|
@@ -23,7 +21,6 @@ class SimpleVectorStore:
|
|
| 23 |
self.index.add(np.array(vectors).astype('float32'))
|
| 24 |
|
| 25 |
def similarity_search(self, query, k=3):
|
| 26 |
-
"""Busca os k documentos mais similares"""
|
| 27 |
query_vector = self.embeddings.encode([query])
|
| 28 |
distances, indices = self.index.search(
|
| 29 |
np.array(query_vector).astype('float32'),
|
|
@@ -33,7 +30,6 @@ class SimpleVectorStore:
|
|
| 33 |
return [self.documents[i] for i in indices[0]]
|
| 34 |
|
| 35 |
|
| 36 |
-
# Carrega o modelo de embeddings uma vez
|
| 37 |
_embeddings_model = SentenceTransformer(EMBEDDING_MODEL)
|
| 38 |
|
| 39 |
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
class SimpleVectorStore:
|
|
|
|
| 8 |
|
| 9 |
def __init__(self, embeddings, documents):
|
| 10 |
self.embeddings = embeddings
|
|
|
|
| 13 |
self._build_index()
|
| 14 |
|
| 15 |
def _build_index(self):
|
|
|
|
| 16 |
texts = [doc.page_content for doc in self.documents]
|
| 17 |
vectors = self.embeddings.encode(texts)
|
| 18 |
|
|
|
|
| 21 |
self.index.add(np.array(vectors).astype('float32'))
|
| 22 |
|
| 23 |
def similarity_search(self, query, k=3):
|
|
|
|
| 24 |
query_vector = self.embeddings.encode([query])
|
| 25 |
distances, indices = self.index.search(
|
| 26 |
np.array(query_vector).astype('float32'),
|
|
|
|
| 30 |
return [self.documents[i] for i in indices[0]]
|
| 31 |
|
| 32 |
|
|
|
|
| 33 |
_embeddings_model = SentenceTransformer(EMBEDDING_MODEL)
|
| 34 |
|
| 35 |
|