Spaces:
Sleeping
Sleeping
Upload 18 files
Browse files- .gitattributes +1 -0
- config/__init__.py +0 -0
- config/__pycache__/__init__.cpython-312.pyc +0 -0
- config/__pycache__/rag_config.cpython-312.pyc +0 -0
- config/rag_config.py +32 -0
- data/embeddings/vector_store.pkl +3 -0
- data/pdfs/Stream-Processing-with-Apache-Flink.pdf +3 -0
- prepare_embeddings.py +30 -0
- requirements.txt +35 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-312.pyc +0 -0
- src/__pycache__/embedder.cpython-312.pyc +0 -0
- src/__pycache__/pdf_loader.cpython-312.pyc +0 -0
- src/__pycache__/rag_pipeline.cpython-312.pyc +0 -0
- src/__pycache__/retriever.cpython-312.pyc +0 -0
- src/embedder.py +14 -0
- src/pdf_loader.py +23 -0
- src/rag_pipeline.py +24 -0
- src/retriever.py +22 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/pdfs/Stream-Processing-with-Apache-Flink.pdf filter=lfs diff=lfs merge=lfs -text
|
config/__init__.py
ADDED
|
File without changes
|
config/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (157 Bytes). View file
|
|
|
config/__pycache__/rag_config.cpython-312.pyc
ADDED
|
Binary file (1.37 kB). View file
|
|
|
config/rag_config.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
@dataclass
|
| 4 |
+
class RAGConfig:
|
| 5 |
+
# Embedding 模型
|
| 6 |
+
embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
|
| 7 |
+
normalize_embeddings: bool = True
|
| 8 |
+
|
| 9 |
+
# 检索参数
|
| 10 |
+
top_k: int = 5
|
| 11 |
+
similarity_threshold: float = 0.4
|
| 12 |
+
|
| 13 |
+
# LLM 模型
|
| 14 |
+
llm_model_name: str = "facebook/rag-token-base"
|
| 15 |
+
llm_max_length: int = 512
|
| 16 |
+
generation_kwargs: dict = None
|
| 17 |
+
|
| 18 |
+
# PDF 路径
|
| 19 |
+
pdf_dir: str = "data/pdfs"
|
| 20 |
+
vector_db_path: str = "data/embeddings/vector_store.pkl"
|
| 21 |
+
|
| 22 |
+
# Chunk 配置
|
| 23 |
+
chunk_size: int = 500
|
| 24 |
+
chunk_overlap: int = 100
|
| 25 |
+
|
| 26 |
+
def __post_init__(self):
|
| 27 |
+
if self.generation_kwargs is None:
|
| 28 |
+
self.generation_kwargs = {
|
| 29 |
+
"max_new_tokens": 200,
|
| 30 |
+
"temperature": 0.7,
|
| 31 |
+
"do_sample": True,
|
| 32 |
+
}
|
data/embeddings/vector_store.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0de8411ab4d21cfde6fcf8fc5db133064929163187c4c63203f0a842cf365df2
|
| 3 |
+
size 3286005
|
data/pdfs/Stream-Processing-with-Apache-Flink.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f4804c75b17f898d45a811973ff188878d876a886dde28be43f0aaabed6bfc0
|
| 3 |
+
size 10182829
|
prepare_embeddings.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import pickle
from tqdm import tqdm
from config.rag_config import RAGConfig
from src.pdf_loader import load_pdfs, chunk_text
from src.embedder import Embedder


def prepare_embeddings():
    """Build the vector store: load PDFs, chunk the text, embed, and pickle."""
    config = RAGConfig()
    embedder = Embedder(config)

    print(f"📂 加载 PDF 文件...")
    documents = load_pdfs(config.pdf_dir)

    print("✂️ 切分文本...")
    # Flatten per-document chunk lists into one corpus, preserving order.
    chunks = [
        piece
        for doc in documents
        for piece in chunk_text(doc, config.chunk_size, config.chunk_overlap)
    ]

    print("🧠 生成 embeddings...")
    vectors = embedder.embed_texts(chunks)

    # Persist texts and vectors side by side for the retriever to load later.
    os.makedirs(os.path.dirname(config.vector_db_path), exist_ok=True)
    with open(config.vector_db_path, "wb") as f:
        pickle.dump({"texts": chunks, "embeddings": vectors}, f)

    print(f"✅ 向量库已保存到 {config.vector_db_path}, 共 {len(chunks)} 段文本")


if __name__ == "__main__":
    prepare_embeddings()
|
requirements.txt
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Transformers + RAG
|
| 2 |
+
transformers>=4.43.0
|
| 3 |
+
|
| 4 |
+
# Sentence Transformers 向量嵌入
|
| 5 |
+
sentence-transformers>=2.2.2
|
| 6 |
+
|
| 7 |
+
# Tokenizers
|
| 8 |
+
tokenizers>=0.13.3
|
| 9 |
+
|
| 10 |
+
# Hugging Face Hub
|
| 11 |
+
huggingface_hub>=0.16.4
|
| 12 |
+
datasets>=2.12.0
|
| 13 |
+
|
| 14 |
+
# FAISS 向量检索
|
| 15 |
+
faiss-cpu>=1.7.4
|
| 16 |
+
|
| 17 |
+
# LangChain + 社区模块
|
| 18 |
+
langchain>=0.2.19
|
| 19 |
+
langchain-community>=0.1.16
|
| 20 |
+
|
| 21 |
+
# PDF 解析
|
| 22 |
+
pypdf>=3.15.0
|
| 23 |
+
|
| 24 |
+
# FastAPI + REST API
|
| 25 |
+
fastapi>=0.102.0
|
| 26 |
+
uvicorn>=0.23.0
|
| 27 |
+
|
| 28 |
+
# Runtime extras (transformers, sentence-transformers and pypdf are already pinned above)
torch
scikit-learn
gradio
numpy
tqdm
|
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (154 Bytes). View file
|
|
|
src/__pycache__/embedder.cpython-312.pyc
ADDED
|
Binary file (1.16 kB). View file
|
|
|
src/__pycache__/pdf_loader.cpython-312.pyc
ADDED
|
Binary file (1.58 kB). View file
|
|
|
src/__pycache__/rag_pipeline.cpython-312.pyc
ADDED
|
Binary file (2.14 kB). View file
|
|
|
src/__pycache__/retriever.cpython-312.pyc
ADDED
|
Binary file (1.66 kB). View file
|
|
|
src/embedder.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from config.rag_config import RAGConfig


class Embedder:
    """Thin wrapper around a SentenceTransformer model for embedding text."""

    def __init__(self, config: RAGConfig):
        # Model name and normalization flag both come from the shared config.
        self.model = SentenceTransformer(config.embedding_model_name)
        self.normalize = config.normalize_embeddings

    def embed_texts(self, texts):
        """Encode *texts* into a numpy array; L2-normalize rows if configured."""
        vectors = self.model.encode(texts, convert_to_numpy=True)
        return normalize(vectors) if self.normalize else vectors
|
src/pdf_loader.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pypdf import PdfReader
|
| 3 |
+
from config.rag_config import RAGConfig
|
| 4 |
+
|
| 5 |
+
def load_pdfs(pdf_dir=None):
    """Extract the text of every PDF found directly in *pdf_dir*.

    Args:
        pdf_dir: directory to scan; defaults to RAGConfig().pdf_dir.

    Returns:
        One string per PDF, pages joined by newlines, in sorted
        filename order.
    """
    pdf_dir = pdf_dir or RAGConfig().pdf_dir
    texts = []
    # sorted() makes the corpus order deterministic across filesystems.
    for filename in sorted(os.listdir(pdf_dir)):
        # Case-insensitive match so ".PDF" files are not silently skipped.
        if filename.lower().endswith(".pdf"):
            path = os.path.join(pdf_dir, filename)
            reader = PdfReader(path)
            # extract_text() may return None for image-only pages.
            text = "\n".join(page.extract_text() or "" for page in reader.pages)
            texts.append(text)
    return texts
|
| 15 |
+
|
| 16 |
+
def chunk_text(text, chunk_size=500, overlap=100):
|
| 17 |
+
chunks = []
|
| 18 |
+
start = 0
|
| 19 |
+
while start < len(text):
|
| 20 |
+
end = start + chunk_size
|
| 21 |
+
chunks.append(text[start:end])
|
| 22 |
+
start += chunk_size - overlap
|
| 23 |
+
return chunks
|
src/rag_pipeline.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import RagTokenizer, RagSequenceForGeneration
from config.rag_config import RAGConfig
from src.embedder import Embedder
from src.retriever import Retriever


class RAGPipeline:
    """End-to-end QA: embed the query, retrieve context, generate an answer."""

    # NOTE(review): the checkpoint name says "rag-token" but the class used is
    # RagSequenceForGeneration — confirm which RAG variant is intended.

    def __init__(self, config: RAGConfig, docs, doc_embeddings):
        self.config = config
        self.embedder = Embedder(config)
        self.retriever = Retriever(doc_embeddings, docs, config)
        self.tokenizer = RagTokenizer.from_pretrained(config.llm_model_name)
        self.model = RagSequenceForGeneration.from_pretrained(config.llm_model_name)

    def ask(self, query):
        """Answer *query*; returns (decoded answer, retrieved (text, score) pairs)."""
        query_emb = self.embedder.embed_texts([query])[0]
        retrieved = self.retriever.retrieve(query_emb)
        # Concatenate retrieved chunks into a single context string.
        context = "\n".join(chunk for chunk, _score in retrieved)
        input_text = f"Question: {query}\nContext: {context}"
        inputs = self.tokenizer(input_text, return_tensors="pt")
        output = self.model.generate(**inputs, **self.config.generation_kwargs)
        answer = self.tokenizer.batch_decode(output, skip_special_tokens=True)[0]
        return answer, retrieved
|
src/retriever.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 3 |
+
from config.rag_config import RAGConfig
|
| 4 |
+
|
| 5 |
+
class Retriever:
    """Cosine-similarity retriever over a precomputed embedding matrix."""

    def __init__(self, embeddings, texts, config: "RAGConfig"):
        # Annotation is a forward reference so the class does not require
        # RAGConfig to be resolvable at definition time.
        self.embeddings = embeddings
        self.texts = texts
        self.top_k = config.top_k
        self.threshold = config.similarity_threshold

    def retrieve(self, query_embedding):
        """Return up to top_k (text, score) pairs with score >= threshold.

        Falls back to the single best match when nothing clears the
        threshold, so callers always receive at least one result.
        """
        scores = self._cosine_scores(np.asarray(query_embedding, dtype=float))
        order = np.argsort(scores)[::-1]  # highest similarity first
        results = [
            (self.texts[i], float(scores[i]))
            for i in order
            if scores[i] >= self.threshold
        ][: self.top_k]
        if not results:
            best = int(np.argmax(scores))
            results = [(self.texts[best], float(scores[best]))]
        return results

    def _cosine_scores(self, query):
        """Cosine similarity of *query* vs every stored embedding, in numpy.

        Replaces sklearn's cosine_similarity — numpy is already imported
        and the computation is one line, so the heavy dependency is
        unnecessary here.
        """
        matrix = np.asarray(self.embeddings, dtype=float)
        doc_norms = np.linalg.norm(matrix, axis=1)
        q_norm = np.linalg.norm(query)
        # Guard zero vectors: their dot products are 0, so dividing by 1
        # yields similarity 0 (matching sklearn's behaviour).
        doc_norms[doc_norms == 0.0] = 1.0
        q_norm = q_norm or 1.0
        return matrix @ query / (doc_norms * q_norm)
|