siyu618 commited on
Commit
94f5c4b
·
verified ·
1 Parent(s): 2c4a06e

Upload 18 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/pdfs/Stream-Processing-with-Apache-Flink.pdf filter=lfs diff=lfs merge=lfs -text
config/__init__.py ADDED
File without changes
config/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (157 Bytes). View file
 
config/__pycache__/rag_config.cpython-312.pyc ADDED
Binary file (1.37 kB). View file
 
config/rag_config.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass, field
2
+
3
@dataclass
class RAGConfig:
    """Central configuration for the RAG pipeline (embedding, retrieval, LLM, paths)."""

    # Embedding model
    embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    normalize_embeddings: bool = True

    # Retrieval parameters
    top_k: int = 5
    similarity_threshold: float = 0.4

    # LLM model
    llm_model_name: str = "facebook/rag-token-base"
    llm_max_length: int = 512
    # default_factory so each instance gets its own dict (a plain dict default
    # would be shared across instances; `= None` forced the __post_init__ hack).
    generation_kwargs: dict = field(
        default_factory=lambda: {
            "max_new_tokens": 200,
            "temperature": 0.7,
            "do_sample": True,
        }
    )

    # PDF paths
    pdf_dir: str = "data/pdfs"
    vector_db_path: str = "data/embeddings/vector_store.pkl"

    # Chunking configuration
    chunk_size: int = 500
    chunk_overlap: int = 100

    def __post_init__(self):
        # Backward compatibility: callers may still pass generation_kwargs=None
        # explicitly and expect the default sampling settings to be filled in.
        if self.generation_kwargs is None:
            self.generation_kwargs = {
                "max_new_tokens": 200,
                "temperature": 0.7,
                "do_sample": True,
            }
data/embeddings/vector_store.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0de8411ab4d21cfde6fcf8fc5db133064929163187c4c63203f0a842cf365df2
3
+ size 3286005
data/pdfs/Stream-Processing-with-Apache-Flink.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f4804c75b17f898d45a811973ff188878d876a886dde28be43f0aaabed6bfc0
3
+ size 10182829
prepare_embeddings.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import pickle

from tqdm import tqdm

from config.rag_config import RAGConfig
from src.pdf_loader import load_pdfs, chunk_text
from src.embedder import Embedder


def prepare_embeddings():
    """Build and persist the vector store from the configured PDF directory.

    Loads every PDF, splits each text into overlapping chunks, embeds the
    chunks, and pickles {"texts": [...], "embeddings": array} to
    config.vector_db_path.
    """
    config = RAGConfig()
    embedder = Embedder(config)

    print("📂 加载 PDF 文件...")
    pdf_texts = load_pdfs(config.pdf_dir)

    print("✂️ 切分文本...")
    all_chunks = []
    # tqdm was imported but unused before; give per-document progress here
    # since chunking large books can take a while.
    for text in tqdm(pdf_texts):
        all_chunks.extend(chunk_text(text, config.chunk_size, config.chunk_overlap))

    print("🧠 生成 embeddings...")
    embeddings = embedder.embed_texts(all_chunks)

    # Ensure the target directory exists before writing the pickle.
    os.makedirs(os.path.dirname(config.vector_db_path), exist_ok=True)
    with open(config.vector_db_path, "wb") as f:
        pickle.dump({"texts": all_chunks, "embeddings": embeddings}, f)

    print(f"✅ 向量库已保存到 {config.vector_db_path}, 共 {len(all_chunks)} 段文本")


if __name__ == "__main__":
    prepare_embeddings()
requirements.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Transformers + RAG
2
+ transformers>=4.43.0
3
+
4
+ # Sentence Transformers 向量嵌入
5
+ sentence-transformers>=2.2.2
6
+
7
+ # Tokenizers
8
+ tokenizers>=0.13.3
9
+
10
+ # Hugging Face Hub
11
+ huggingface_hub>=0.16.4
12
+ datasets>=2.12.0
13
+
14
+ # FAISS 向量检索
15
+ faiss-cpu>=1.7.4
16
+
17
+ # LangChain + 社区模块
18
+ langchain>=0.2.19
19
+ langchain-community>=0.1.16
20
+
21
+ # PDF 解析
22
+ pypdf>=3.15.0
23
+
24
+ # FastAPI + REST API
25
+ fastapi>=0.102.0
26
+ uvicorn>=0.23.0
27
+
28
+ torch
29
+ transformers
30
+ sentence-transformers
31
+ scikit-learn
32
+ gradio
33
+ pypdf
34
+ numpy
35
+ tqdm
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (154 Bytes). View file
 
src/__pycache__/embedder.cpython-312.pyc ADDED
Binary file (1.16 kB). View file
 
src/__pycache__/pdf_loader.cpython-312.pyc ADDED
Binary file (1.58 kB). View file
 
src/__pycache__/rag_pipeline.cpython-312.pyc ADDED
Binary file (2.14 kB). View file
 
src/__pycache__/retriever.cpython-312.pyc ADDED
Binary file (1.66 kB). View file
 
src/embedder.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from sklearn.preprocessing import normalize
3
+ from config.rag_config import RAGConfig
4
+
5
class Embedder:
    """Thin wrapper around SentenceTransformer with optional L2 normalization."""

    def __init__(self, config: RAGConfig):
        # Model choice and normalization flag both come from the shared config.
        self.model = SentenceTransformer(config.embedding_model_name)
        self.normalize = config.normalize_embeddings

    def embed_texts(self, texts):
        """Encode *texts* and return a numpy array of embedding vectors."""
        vectors = self.model.encode(texts, convert_to_numpy=True)
        # Unit-length vectors make cosine similarity a plain dot product.
        return normalize(vectors) if self.normalize else vectors
src/pdf_loader.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pypdf import PdfReader
3
+ from config.rag_config import RAGConfig
4
+
5
def load_pdfs(pdf_dir=None):
    """Extract text from every PDF in *pdf_dir* (defaults to the configured dir).

    Returns a list with one string per PDF. Files are processed in sorted
    name order so downstream chunking/embedding order is deterministic.
    """
    pdf_dir = pdf_dir or RAGConfig().pdf_dir
    texts = []
    # os.listdir returns entries in arbitrary order; sort for reproducible
    # vector-store contents across runs/machines.
    for filename in sorted(os.listdir(pdf_dir)):
        # Case-insensitive suffix check so ".PDF" files are not silently skipped.
        if filename.lower().endswith(".pdf"):
            path = os.path.join(pdf_dir, filename)
            reader = PdfReader(path)
            # extract_text() may return None (e.g. image-only pages) — coerce to "".
            text = "\n".join(page.extract_text() or "" for page in reader.pages)
            texts.append(text)
    return texts
15
+
16
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into chunks of at most *chunk_size* characters.

    Consecutive chunks share *overlap* characters. Returns [] for empty input.

    Raises:
        ValueError: if overlap >= chunk_size or chunk_size <= 0 (the scan
            step would be non-positive and the original loop would never
            terminate), or if overlap is negative.
    """
    if chunk_size <= 0 or overlap >= chunk_size:
        raise ValueError("chunk_size must be positive and greater than overlap")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")
    step = chunk_size - overlap
    # range-based scan is equivalent to the manual start/while loop.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
src/rag_pipeline.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import RagTokenizer, RagSequenceForGeneration
2
+ from config.rag_config import RAGConfig
3
+ from src.embedder import Embedder
4
+ from src.retriever import Retriever
5
+
6
class RAGPipeline:
    """End-to-end pipeline: embed the query, retrieve context, generate an answer."""

    def __init__(self, config: RAGConfig, docs, doc_embeddings):
        self.config = config
        self.embedder = Embedder(config)
        self.retriever = Retriever(doc_embeddings, docs, config)
        self.tokenizer = RagTokenizer.from_pretrained(config.llm_model_name)
        self.model = RagSequenceForGeneration.from_pretrained(config.llm_model_name)

    def ask(self, query):
        """Answer *query*; returns (generated_text, list of retrieved (text, score) pairs)."""
        # Embed the query with the same model used for the documents.
        query_emb = self.embedder.embed_texts([query])[0]
        retrieved = self.retriever.retrieve(query_emb)
        # Concatenate retrieved chunks into a single context block for the prompt.
        context = "\n".join(chunk for chunk, _score in retrieved)
        prompt = f"Question: {query}\nContext: {context}"
        encoded = self.tokenizer(prompt, return_tensors="pt")
        generated = self.model.generate(**encoded, **self.config.generation_kwargs)
        answer = self.tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        return answer, retrieved
src/retriever.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sklearn.metrics.pairwise import cosine_similarity
3
+ from config.rag_config import RAGConfig
4
+
5
class Retriever:
    """Cosine-similarity retriever with a score threshold and top-k cap."""

    def __init__(self, embeddings, texts, config: RAGConfig):
        self.embeddings = embeddings
        self.texts = texts
        self.top_k = config.top_k
        self.threshold = config.similarity_threshold

    def retrieve(self, query_embedding):
        """Return up to top_k (text, score) pairs with score >= threshold.

        Falls back to the single best match when nothing clears the
        threshold, so callers always get at least one result.
        """
        scores = cosine_similarity([query_embedding], self.embeddings)[0]
        ranked = np.argsort(scores)[::-1]  # indices, best score first
        results = []
        for idx in ranked:
            if scores[idx] < self.threshold:
                continue
            results.append((self.texts[idx], float(scores[idx])))
        results = results[:self.top_k]
        if not results:
            # Nothing passed the threshold: return the best match anyway.
            top = int(np.argmax(scores))
            results = [(self.texts[top], float(scores[top]))]
        return results