mryt66 committed
Commit a840639 · 0 Parent(s)

Initial commit

.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.index filter=lfs diff=lfs merge=lfs -text
+ *.db filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
Binary file (287 Bytes).
 
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.12-slim
+
+ ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 PORT=7860 HF_HOME=/root/.cache/huggingface
+
+ WORKDIR /app
+
+ # faiss / numpy performance dep
+ RUN apt-get update && apt-get install -y --no-install-recommends libgomp1 && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Optional: prefetch embedding model to reduce first-request latency
+ RUN python - <<'PY' || true
+ from sentence_transformers import SentenceTransformer
+ SentenceTransformer('Qwen/Qwen3-Embedding-0.6B')
+ PY
+
+ COPY . .
+
+ EXPOSE 7860
+
+ # Start only the API
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: 'Rag Chat'
+ emoji: 🐠
+ colorFrom: blue
+ colorTo: green
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
api.py ADDED
@@ -0,0 +1,379 @@
+ import os
+ import json
+ import google.generativeai as genai
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+ from datetime import datetime
+ from contextlib import asynccontextmanager
+
+ from fastapi import FastAPI, Depends, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from sqlalchemy import Column, Integer, Text, DateTime, create_engine
+ from sqlalchemy.orm import declarative_base, sessionmaker, Session
+ from pydantic import BaseModel
+ import uvicorn
+ from starlette.concurrency import run_in_threadpool
+ import subprocess, sys
+
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+ # Always use local data directory (no env var logic)
+ DATA_DIR = os.path.join(SCRIPT_DIR, "data")
+ os.makedirs(DATA_DIR, exist_ok=True)
+
+ OUTPUT_CHUNKS_FILE = os.path.join(SCRIPT_DIR, "output_chunks.jsonl")
+ RAG_CONFIG_FILE = os.path.join(SCRIPT_DIR, "rag_prompt_config.jsonl")
+ FAISS_INDEX_FILE = os.path.join(DATA_DIR, "faiss_index.index")
+ EMBEDDINGS_FILE = os.path.join(DATA_DIR, "chunk_embeddings.npy")
+ DATABASE_URL = f"sqlite:///{os.path.join(DATA_DIR, 'conversations.db')}"
+
+ Base = declarative_base()
+ engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
+ SessionLocal = sessionmaker(bind=engine)
+
+
+ # Database model
+ class Conversation(Base):
+     __tablename__ = "conversations"
+
+     id = Column(Integer, primary_key=True, index=True)
+     query = Column(Text)
+     response = Column(Text)
+     context = Column(Text)
+     base_context = Column(Text)
+     system_prompt = Column(Text)
+     full_prompt = Column(Text)
+     timestamp = Column(DateTime, default=datetime.utcnow)
+
+
+ # Pydantic models for API
+ class ChatResponse(BaseModel):
+     response: str
+     timestamp: datetime
+
+
+ class ChatRequest(BaseModel):
+     query: str
+     history: list[dict] | None = None  # optional conversation history
+
+
+ # Initialize Gemini API
+ # API key is configured during app startup (lifespan) to avoid import-time failures
+
+
+ # Lifespan function to handle startup and shutdown
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Startup
+     print("Starting RAG Chat API...")
+     print(f"SQLite DB path: {os.path.join(DATA_DIR, 'conversations.db')}")
+     # Ensure tables now that directory is confirmed writable
+     Base.metadata.create_all(bind=engine)
+
+     # Configure Gemini here (fail early but at startup)
+     API_KEY = os.getenv("GEMINI_API_KEY")
+     if not API_KEY:
+         raise RuntimeError("Please set GEMINI_API_KEY environment variable")
+     genai.configure(api_key=API_KEY)
+
+     try:
+         success, chunks_count = initialize_system()
+         if success:
+             print(f"✅ RAG system initialized successfully with {chunks_count} chunks")
+             print("API ready at: http://localhost:8000")
+             print("API documentation at: http://localhost:8000/docs")
+         else:
+             print("❌ Failed to initialize RAG system")
+             raise RuntimeError("System initialization failed")
+     except Exception as e:
+         print(f"❌ Initialization error: {str(e)}")
+         raise RuntimeError(f"System initialization failed: {str(e)}")
+
+     yield  # This is where the app runs
+
+     # Shutdown (if needed)
+     print("Shutting down RAG Chat API...")
+
+
+ # Initialize FastAPI app with lifespan
+ app = FastAPI(
+     title="RAG Chat API",
+     description="RAG Chat System with Database Integration",
+     lifespan=lifespan,
+ )
+
+ """API only module.
+
+ The web chat UI has been moved to a separate app (see web_app.py). This file now
+ exposes only the JSON API endpoints so it can be containerised independently or
+ scaled separately from the frontend.
+ """
+
+
+ # Enable CORS for local dev
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[
+         "http://localhost",
+         "http://localhost:3000",
+         "http://127.0.0.1:3000",
+         "http://localhost:8001",  # web UI container
+         "http://127.0.0.1:8001",
+     ],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Global variables to store precomputed data
+ chunks_data = None
+ chunk_embeddings = None
+ faiss_index = None
+ base_chunk = None
+ system_prompt = None
+ model_embedding = None
+
+
+ # Dependency to get database session
+ def get_db():
+     db = SessionLocal()
+     try:
+         yield db
+     finally:
+         db.close()
+
+
+ def load_chunks(json_file):
+     """Load chunks from a JSON (array) or JSON Lines file."""
+     try:
+         with open(json_file, "r", encoding="utf-8") as file:
+             text = file.read()
+     except FileNotFoundError:
+         raise FileNotFoundError(f"File {json_file} not found!")
+     try:
+         return json.loads(text)
+     except json.JSONDecodeError:
+         # Fall back to JSON Lines (one object per line), as written by generate_rag_data.py
+         try:
+             return [json.loads(line) for line in text.splitlines() if line.strip()]
+         except json.JSONDecodeError:
+             raise ValueError(f"Invalid JSON in {json_file}")
+
+
+ def compute_and_cache_embeddings(chunks):
+     """Compute embeddings for all chunks and cache them"""
+     global chunk_embeddings, faiss_index
+
+     print("Computing embeddings for all chunks...")
+     texts = [chunk["content"] for chunk in chunks]
+
+     # Load or compute embeddings
+     if os.path.exists(EMBEDDINGS_FILE):
+         print("Loading cached embeddings...")
+         chunk_embeddings = np.load(EMBEDDINGS_FILE)
+         if chunk_embeddings.shape[0] != len(texts):
+             print("Cached embeddings count mismatches chunks. Recomputing...")
+             chunk_embeddings = model_embedding.encode(
+                 texts, convert_to_numpy=True
+             ).astype("float32")
+             np.save(EMBEDDINGS_FILE, chunk_embeddings)
+     else:
+         print("Computing new embeddings (this may take a moment)...")
+         chunk_embeddings = model_embedding.encode(texts, convert_to_numpy=True).astype(
+             "float32"
+         )
+         np.save(EMBEDDINGS_FILE, chunk_embeddings)
+         print("Embeddings cached for future use.")
+
+     # Normalize embeddings (for cosine similarity with IndexFlatIP)
+     faiss.normalize_L2(chunk_embeddings)
+
+     # Create or load FAISS index
+     embedding_dim = chunk_embeddings.shape[1]
+     if os.path.exists(FAISS_INDEX_FILE):
+         print("Loading cached FAISS index...")
+         faiss_index = faiss.read_index(FAISS_INDEX_FILE)
+         # Validate index matches embeddings
+         if getattr(faiss_index, "ntotal", 0) != chunk_embeddings.shape[0]:
+             print("FAISS index size mismatches embeddings. Rebuilding index...")
+             faiss_index = faiss.IndexFlatIP(embedding_dim)
+             faiss_index.add(chunk_embeddings)
+             faiss.write_index(faiss_index, FAISS_INDEX_FILE)
+     else:
+         print("Creating new FAISS index...")
+         faiss_index = faiss.IndexFlatIP(embedding_dim)
+         faiss_index.add(chunk_embeddings)
+         faiss.write_index(faiss_index, FAISS_INDEX_FILE)
+         print("FAISS index cached for future use.")
+
+
+ def retrieve_relevant_chunks(query, top_k=3):
+     """Retrieve most relevant chunks for the query using precomputed embeddings"""
+     global chunks_data, faiss_index
+
+     if faiss_index is None or chunks_data is None:
+         raise RuntimeError("RAG index not initialized")
+
+     # Encode query
+     query_embedding = model_embedding.encode([query], convert_to_numpy=True).astype(
+         "float32"
+     )
+     faiss.normalize_L2(query_embedding)
+
+     top_k = min(top_k, len(chunks_data))
+     # Search in precomputed index
+     _, indices = faiss_index.search(query_embedding, top_k)
+     return [chunks_data[i] for i in indices[0]]
+
+
+ def _format_history(history: list[dict] | None, max_turns: int = 6) -> str:
+     """Format recent conversation history for inclusion in the prompt."""
+     if not history:
+         return ""
+     recent = history[-max_turns:]
+     lines = []
+     for turn in recent:
+         role = turn.get("role", "user")
+         msg = (turn.get("message") or "").strip()
+         if not msg:
+             continue
+         prefix = "User" if role == "user" else "Assistant"
+         lines.append(f"{prefix}: {msg}")
+     return "\n".join(lines)
+
+
+ def construct_prompt(base_chunk, system_prompt, query, history_text: str = ""):
+     """Construct the full prompt with relevant context"""
+     relevant_chunks = retrieve_relevant_chunks(query)
+     context = "\n\n".join(chunk["content"] for chunk in relevant_chunks)
+     full_prompt = (
+         f"System prompt:\n{system_prompt['content']}\n\n"
+         f"Context:\n{context}\n\n"
+         f"{base_chunk['content']}\n\n"
+     )
+     if history_text:
+         full_prompt += f"Recent conversation:\n{history_text}\n\n"
+     full_prompt += f"Query:\n{query}"
+     return full_prompt, context
+
+
+ def get_answer(prompt):
+     """Get answer from Gemini API"""
+     try:
+         model = genai.GenerativeModel("gemini-2.5-flash")
+         response = model.generate_content(prompt)
+         return response.text
+     except Exception as e:
+         print(f"Error getting response from Gemini: {e}")
+         return None
+
+
+ def run_generate_rag_data():
+     """Run the data generation script if available."""
+     script_path = os.path.join(SCRIPT_DIR, "generate_rag_data.py")
+     if not os.path.isfile(script_path):
+         print("generate_rag_data.py not found; skipping automatic generation.")
+         return
+     print("Running generate_rag_data.py to build RAG data...")
+     try:
+         subprocess.run([sys.executable, script_path], cwd=SCRIPT_DIR, check=True)
+         print("generate_rag_data.py completed.")
+     except subprocess.CalledProcessError as e:
+         raise RuntimeError(f"generate_rag_data.py failed (exit {e.returncode})") from e
+
+
+ def initialize_system():
+     """Initialize the RAG system with precomputed embeddings"""
+     global chunks_data, base_chunk, system_prompt, model_embedding
+
+     try:
+         # If embeddings or required JSON files are missing, (re)generate data first.
+         need_generation = (
+             not os.path.exists(EMBEDDINGS_FILE)
+             or not os.path.exists(OUTPUT_CHUNKS_FILE)
+             or not os.path.exists(RAG_CONFIG_FILE)
+         )
+         if need_generation:
+             print("RAG data or embeddings missing. Triggering data generation...")
+             run_generate_rag_data()
+
+         # Initialize embedding model
+         print("Loading embedding model...")
+         model_embedding = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
+
+         # Load configurations
+         print("Loading chunks and configuration...")
+         chunks_data = load_chunks(OUTPUT_CHUNKS_FILE)
+         config = load_chunks(RAG_CONFIG_FILE)[0]
+         base_chunk = config["base_chunk"]
+         system_prompt = config["system_prompt"]
+
+         print(f"Loaded {len(chunks_data)} chunks from knowledge base")
+
+         # Precompute embeddings once (will compute if file absent)
+         compute_and_cache_embeddings(chunks_data)
+
+         print("System initialized successfully!")
+         return True, len(chunks_data)
+
+     except Exception as e:
+         print(f"Failed to initialize system: {e}")
+         return False, 0
+
+
+ @app.post("/chat", response_model=ChatResponse)
+ async def chat_endpoint(payload: ChatRequest, db: Session = Depends(get_db)):
+     """Chat endpoint that processes queries and saves conversations to database
+
+     Accepts a JSON body: {"query": "...", "history": [...]}; "history" is optional.
+     """
+     global base_chunk, system_prompt
+
+     query = (payload.query or "").strip()
+     if not query:
+         raise HTTPException(status_code=400, detail="Query cannot be empty")
+
+     try:
+         # Construct prompt and get answer
+         history_text = _format_history(payload.history)
+         full_prompt, context = construct_prompt(
+             base_chunk, system_prompt, query, history_text
+         )
+
+         # Avoid blocking the event loop with a sync network call
+         answer = await run_in_threadpool(get_answer, full_prompt)
+         if not answer:
+             answer = "Sorry, I failed to get a response from Gemini. Please try again."
+
+         # Save conversation to database
+         conversation = Conversation(
+             query=query,
+             response=answer,
+             context=context,
+             base_context=base_chunk["content"],
+             system_prompt=system_prompt["content"],
+             full_prompt=full_prompt,
+         )
+
+         db.add(conversation)
+         db.commit()
+
+         return ChatResponse(response=answer, timestamp=conversation.timestamp)
+
+     except Exception as e:
+         db.rollback()
+         raise HTTPException(status_code=500, detail=f"Chat processing error: {str(e)}")
+
+
+ # Simple health probe
+ @app.get("/health")
+ def health():
+     return {"status": "ok"}
+
+
+ if __name__ == "__main__":
+     # Check environment variable (os is already imported at module level)
+     if not os.getenv("GEMINI_API_KEY"):
+         print("Warning: GEMINI_API_KEY environment variable not set!")
+         print("Please set it with: set GEMINI_API_KEY=your_api_key_here")
+         exit(1)
+
+     print("Starting RAG Chat API server...")
+     uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True, log_level="info")
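
A minimal client sketch for the /chat endpoint above (a hypothetical example, not part of the commit: it assumes the API is running locally on port 8000 and that the requests package is installed; the history entries follow the {"role", "message"} shape that _format_history expects):

import requests

payload = {
    "query": "How does the SOM instruction work?",
    # Optional recent turns; any role other than "user" is rendered as "Assistant"
    "history": [
        {"role": "user", "message": "What is Maszyna W?"},
        {"role": "assistant", "message": "A simple teaching model of a processor."},
    ],
}
resp = requests.post("http://localhost:8000/chat", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["response"])
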
docker-compose.yml ADDED
@@ -0,0 +1,32 @@
+ version: '3.8'
+ services:
+   rag-api:
+     build:
+       context: .
+       args:
+         SERVICE_TARGET: api
+     image: rag-chat-api:latest
+     container_name: rag-chat-api
+     environment:
+       - GEMINI_API_KEY=${GEMINI_API_KEY}
+     volumes:
+       - ./data:/app/data
+       - ./output_chunks.jsonl:/app/output_chunks.jsonl:ro
+       - ./rag_prompt_config.jsonl:/app/rag_prompt_config.jsonl:ro
+     ports:
+       # the container listens on 7860 (Dockerfile CMD); publish it as 8000 on the host
+       - "8000:7860"
+     restart: unless-stopped
+   rag-web:
+     build:
+       context: .
+       args:
+         SERVICE_TARGET: web
+     image: rag-chat-web:latest
+     container_name: rag-chat-web
+     environment:
+       # the API container serves on its internal port 7860 (see Dockerfile CMD)
+       - API_BASE=http://rag-api:7860
+     depends_on:
+       - rag-api
+     ports:
+       - "8001:8001"
+     restart: unless-stopped
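
Once the stack is up (docker compose up), the /health probe defined in api.py gives a quick smoke test from the host; a sketch assuming the requests package is available:

import requests

# Port 7860 inside the rag-api container is published as 8000 on the host (see ports above)
print(requests.get("http://localhost:8000/health", timeout=5).json())  # {"status": "ok"}
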
generate_rag_data.py ADDED
@@ -0,0 +1,223 @@
+ from google import genai
+ from typing import List
+ from pathlib import Path
+ import fitz
+ import json
+ import os
+ from settings import Chunk, Settings
+
+
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+ DATA_DIR = os.path.join(SCRIPT_DIR, "data")
+ os.makedirs(DATA_DIR, exist_ok=True)
+
+ # Input: put your raw source files (txt/markdown) inside ./data/source
+ SOURCE_DIR = os.path.join(DATA_DIR, "source")
+ os.makedirs(SOURCE_DIR, exist_ok=True)
+
+ # Output artifact locations (align with api.py expectations)
+ OUTPUT_CHUNKS_FILE = os.path.join(
+     SCRIPT_DIR, "output_chunks.jsonl"
+ )  # already used in api.py
+ RAG_CONFIG_FILE = os.path.join(
+     SCRIPT_DIR, "rag_prompt_config.jsonl"
+ )  # already used in api.py
+ # If you also want these in data/ instead, uncomment:
+ # OUTPUT_CHUNKS_FILE = os.path.join(DATA_DIR, "output_chunks.jsonl")
+ # RAG_CONFIG_FILE = os.path.join(DATA_DIR, "rag_prompt_config.jsonl")
+
+ # Example system / base prompts (edit as needed)
+ SYSTEM_PROMPT = {
+     "role": "system",
+     "content": "You are a helpful RAG assistant. Use only the provided context. If unsure, say you don't know.",
+ }
+ BASE_CHUNK = {
+     "role": "base",
+     "content": "Answer the user's query using only the contextual chunks below.",
+ }
+
+
+ def extract_pdf_text(filename: str) -> str:
+     text = ""
+     with fitz.open(filename) as doc:
+         for page in doc:
+             text += page.get_text()
+     return text
+
+
+ def chunk_pdf(filename: str) -> List[Chunk]:
+     text = extract_pdf_text(filename)
+     pdf_name = Path(filename).name
+
+     prompt = f"""
+     Split the following text into coherent chunks suitable for RAG.
+     Each chunk should be 100-500 words.
+     Do not cut mid-sentence, paragraph, or table.
+     Preserve headings, bullet points, and tables.
+
+     Return an array of JSON objects with this structure:
+     {{
+         "content": "<chunk text>",
+         "source": "{pdf_name}",
+         "tags": [],
+         "type": "prg"
+     }}
+     Text:
+     {text}
+     """
+
+     client = genai.Client()
+     response = client.models.generate_content(
+         model="gemini-2.5-flash",
+         contents=prompt,
+         config={
+             "response_mime_type": "application/json",
+             "response_schema": Settings.response_schema,
+         },
+     )
+
+     chunks: List[Chunk] = response.parsed
+     return chunks
+
+
+ def process_pdf_folder(folder_path):
+     folder = Path(folder_path)
+     pdfs = list(folder.glob("*.pdf"))
+     all_chunks = []
+     if not pdfs:
+         print(f"No PDF files found in {folder_path}")
+         return []
+     pdfs.sort(key=lambda x: x.name)
+     for pdf_file in pdfs:
+         print(f"Processing PDF: {pdf_file.name}")
+         chunks = chunk_pdf(filename=pdf_file)
+         all_chunks.extend(chunks)
+     return all_chunks
+
+
+ def make_prg_chunk(text, filename):
+     return [
+         {
+             "content": text.strip(),
+             "source": Path(filename).name,
+             "tags": [],
+             "type": "prg",
+         }
+     ]
+
+
+ def process_prg_folder(folder_path):
+     folder = Path(folder_path)
+     all_chunks = []
+     prgs = list(folder.glob("*.prg"))
+     if not prgs:
+         print(f"No .prg files found in {folder_path}")
+         return []
+     prgs.sort(key=lambda x: x.name)
+     for prg_file in prgs:
+         print(f"Processing PRG: {prg_file.name}")
+         text = prg_file.read_text(encoding="utf-8", errors="ignore")
+         chunk = make_prg_chunk(text, prg_file.name)
+         all_chunks.extend(chunk)
+     return all_chunks
+
+
+ def read_source_files():
+     """Load all .txt / .md files from SOURCE_DIR."""
+     files = []
+     for name in os.listdir(SOURCE_DIR):
+         if name.lower().endswith((".txt", ".md")):
+             path = os.path.join(SOURCE_DIR, name)
+             with open(path, "r", encoding="utf-8") as f:
+                 files.append((name, f.read()))
+     if not files:
+         # Provide a fallback demo file if none exist
+         demo_path = os.path.join(SOURCE_DIR, "demo.txt")
+         demo_text = (
+             "This is a demo knowledge file.\n"
+             "Add your project or domain documentation as .txt or .md files here."
+         )
+         with open(demo_path, "w", encoding="utf-8") as f:
+             f.write(demo_text)
+         files.append(("demo.txt", demo_text))
+     return files
+
+
+ def chunk_text(text: str, max_chars: int = 1200, overlap: int = 150):
+     """Simple character-based chunking with overlap."""
+     text = text.strip()
+     if not text:
+         return []
+     chunks = []
+     start = 0
+     while start < len(text):
+         end = min(len(text), start + max_chars)
+         chunk = text[start:end]
+         chunks.append(chunk.strip())
+         if end >= len(text):
+             break
+         start = end - overlap
+         if start < 0:
+             start = 0
+     return chunks
+
+
+ def build_chunks():
+     """Create chunk objects suitable for embedding."""
+     all_files = read_source_files()
+     chunks = []
+     idx = 0
+     for filename, content in all_files:
+         parts = chunk_text(content)
+         for part in parts:
+             chunks.append({"id": idx, "source": filename, "content": part})
+             idx += 1
+     return chunks
+
+
+ def write_jsonl(path: str, records):
+     with open(path, "w", encoding="utf-8") as f:
+         for r in records:
+             if hasattr(r, "model_dump"):
+                 r = r.model_dump()  # serialize pydantic Chunk objects (pydantic v2)
+             f.write(json.dumps(r, ensure_ascii=False) + "\n")
+
+
+ def write_config(path: str):
+     """Write system + base prompt config file (list with single object)."""
+     obj = [{"system_prompt": SYSTEM_PROMPT, "base_chunk": BASE_CHUNK}]
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(obj, f, ensure_ascii=False, indent=2)
+
+
+ def main():
+     pdf_folder = r"C:\Users\kogut\Python\Assembler_rag\data\pdfs"
+     prg_folder = r"C:\Users\kogut\Python\Assembler_rag\data\prg"
+     # pdf_folder = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./data/pdfs")
+     # prg_folder = Path(sys.argv[2]) if len(sys.argv) > 2 else None
+
+     all_chunks = process_pdf_folder(pdf_folder)
+     if prg_folder:
+         all_chunks += process_prg_folder(prg_folder)
+     print(f"Collected {len(all_chunks)} chunks from PDF and PRG sources")
+
+     print(f"Generating RAG data from: {SOURCE_DIR}")
+     all_chunks += build_chunks()
+     print(f"Built {len(all_chunks)} chunks in total")
+
+     # Write everything in one pass so the text chunks do not clobber the PDF/PRG chunks
+     write_jsonl(OUTPUT_CHUNKS_FILE, all_chunks)
+     write_config(RAG_CONFIG_FILE)
+     print(f"Wrote chunks to: {OUTPUT_CHUNKS_FILE}")
+     print(f"Wrote config to: {RAG_CONFIG_FILE}")
+     print("Done.")
+
+
+ if __name__ == "__main__":
+     main()
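
As a quick sanity check of the chunk_text overlap logic above, a hypothetical run (illustrative values only; assumes the project dependencies are installed, since importing the module also creates the data/ directories as a side effect):

from generate_rag_data import chunk_text

# 3000 characters with max_chars=1200 and overlap=150: windows start at
# 0, 1050 and 2100, giving chunk lengths 1200, 1200 and 900
parts = chunk_text("x" * 3000, max_chars=1200, overlap=150)
print([len(p) for p in parts])  # [1200, 1200, 900]
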
output_chunks.jsonl ADDED
The diff for this file is too large to render.
 
rag_prompt_config.jsonl ADDED
@@ -0,0 +1,16 @@
+ [
+   {
+     "base_chunk": {
+       "content": "When writing a program in the assembly language of Maszyna W, we use almost exclusively instructions corresponding to the machine instructions of the given processor. It is therefore necessary to learn these instructions, just as it is to become familiar with the architecture of that processor. A program in assembly language consists of instructions and the data on which those instructions operate. The program is made up of consecutive lines; each line may contain a single instruction or the declaration of a single data item. Formally, the syntax of a program line is as follows:\n[<label>:] <instruction or pseudo-instruction> [<argument>]\nwhere:\n<label> – a string of letters and digits serving as a symbolic representation of a specific address\n<instruction or pseudo-instruction> – the symbolic name of one of the processor's instructions or of one of the so-called pseudo-instructions that reserve memory for data (RST, RPA)\n<argument> – a decimal number or one of the labels introduced at the beginning of a line\nThe pseudo-instructions RST and RPA allow you, respectively, to reserve memory for a single data item with a fixed initial value (given as the argument) and to reserve memory for a data item without specifying its initial value (to reserve memory for a data item we can write <label> RST <value> or <label> RPA). The instruction field may contain the name of one of the available instructions. We assume that the processor of Maszyna W provides the 8 instructions listed in the table below.\nName Code Operation\nSTP 000 Stop (terminate) program execution\nDOD 001 Add the contents of the memory cell indicated by the argument to the accumulator\nODE 010 Subtract the contents of the memory cell indicated by the argument from the accumulator\nPOB 011 Load the contents of the memory cell indicated by the argument into the accumulator\nŁAD 100 Store the contents of the accumulator in the memory cell indicated by the argument\nSOB 101 Make the next executed instruction the one located in the memory cell indicated by the argument (the so-called unconditional jump)\nSOM 110 If the accumulator holds a negative number, the next instruction executed is the one located in the memory cell indicated by the argument. If the accumulator holds a non-negative number, the next instruction executed is the one located in memory directly after the SOM instruction\nSOZ 111 A jump to the address indicated by the argument, taken only when the accumulator holds 0. Otherwise the next instruction executed is the one located in memory directly after the SOZ instruction\nTo write a program in the assembly language of Maszyna W, you must first devise an algorithm that solves the given task, formulate it using only the available instructions, and finally write it down as lines of a program in assembly language.\n",
+       "source": "Asembler.pdf",
+       "tags": ["Maszyna W operation", "programming", "assembly language"],
+       "type": "explanation"
+     },
+     "system_prompt": {
+       "content": "You are an expert on the Maszyna W language who communicates in Polish.\nAnswer concisely and to the point.\nYour task is to help write a program in this language or to explain how a program works.\nAll the information you have is contained in the context, although if you think something is worth mentioning, do so. Try to be concise and factual.\n\nIf the user asks a question that is not related to the Maszyna W language, let them know and ask them to pose a question about this language.\n\nRemember that your goal is to help the user understand how the Maszyna W language works and to write programs in this language.",
+       "source": "system_prompt",
+       "tags": ["system prompt", "expert"],
+       "type": "system"
+     }
+   }
+ ]
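
For reference, api.py reads this file via load_chunks and takes the first array element; a minimal sketch of that consumption path (run from the repo root):

import json

with open("rag_prompt_config.jsonl", encoding="utf-8") as f:
    config = json.load(f)[0]  # the file holds a JSON array with one object

print(config["base_chunk"]["source"])           # "Asembler.pdf"
print(config["system_prompt"]["content"][:60])  # first characters of the system prompt
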
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ google-generativeai
+ google-genai
+ sentence-transformers
+ faiss-cpu
+ numpy
+ fastapi
+ uvicorn
+ sqlalchemy
+ pydantic
+ PyMuPDF
settings.py ADDED
@@ -0,0 +1,32 @@
+ from pydantic import BaseModel
+ from typing import List
+
+
+ class Chunk(BaseModel):
+     content: str
+     source: str
+     tags: List[str] = []
+     type: str = "prg"
+
+
+ class Settings:
+     response_schema = {
+         "type": "array",
+         "items": {
+             "type": "object",
+             "properties": {
+                 "content": {
+                     "type": "string",
+                     "description": "Text of the chunk (100-500 words, do not cut mid-sentence/paragraph/table)",
+                 },
+                 "source": {"type": "string", "description": "PDF filename"},
+                 "tags": {"type": "array", "items": {"type": "string"}},
+                 "type": {
+                     "type": "string",
+                     "description": "Chunk type",
+                     "default": "prg",
+                 },
+             },
+             "required": ["content", "source", "tags", "type"],
+         },
+     }
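
Since the hand-written response_schema must mirror the Chunk model, a small consistency check can catch drift between the two (a sketch assuming pydantic v2, where model_fields lists the declared fields):

from settings import Chunk, Settings

schema_fields = set(Settings.response_schema["items"]["properties"])
model_fields = set(Chunk.model_fields)  # {"content", "source", "tags", "type"}
assert schema_fields == model_fields, (schema_fields, model_fields)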