mryt66 committed
Commit a840639 · 0 Parent(s)

Initial commit

.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.index filter=lfs diff=lfs merge=lfs -text
+ *.db filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
Binary file (287 Bytes).
 
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.12-slim
+
+ ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 PORT=7860 HF_HOME=/root/.cache/huggingface
+
+ WORKDIR /app
+
+ # faiss / numpy performance dep
+ RUN apt-get update && apt-get install -y --no-install-recommends libgomp1 && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Optional: prefetch embedding model to reduce first-request latency
+ RUN python - <<'PY' || true
+ from sentence_transformers import SentenceTransformer
+ SentenceTransformer('Qwen/Qwen3-Embedding-0.6B')
+ PY
+
+ COPY . .
+
+ EXPOSE 7860
+
+ # Start only the API
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: 'Rag Chat'
+ emoji: 🐠
+ colorFrom: blue
+ colorTo: green
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
api.py ADDED
@@ -0,0 +1,379 @@
+ import os
+ import json
+ import google.generativeai as genai
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+ from datetime import datetime
+ from contextlib import asynccontextmanager
+
+ from fastapi import FastAPI, Depends, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from sqlalchemy import Column, Integer, Text, DateTime, create_engine
+ from sqlalchemy.orm import declarative_base, sessionmaker, Session
+ from pydantic import BaseModel
+ import uvicorn
+ from starlette.concurrency import run_in_threadpool
+ import subprocess, sys
+
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+ # Always use local data directory (no env var logic)
+ DATA_DIR = os.path.join(SCRIPT_DIR, "data")
+ os.makedirs(DATA_DIR, exist_ok=True)
+
+ OUTPUT_CHUNKS_FILE = os.path.join(SCRIPT_DIR, "output_chunks.jsonl")
+ RAG_CONFIG_FILE = os.path.join(SCRIPT_DIR, "rag_prompt_config.jsonl")
+ FAISS_INDEX_FILE = os.path.join(DATA_DIR, "faiss_index.index")
+ EMBEDDINGS_FILE = os.path.join(DATA_DIR, "chunk_embeddings.npy")
+ DATABASE_URL = f"sqlite:///{os.path.join(DATA_DIR, 'conversations.db')}"
+
+ Base = declarative_base()
+ engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
+ SessionLocal = sessionmaker(bind=engine)
+
+
+ # Database model
+ class Conversation(Base):
+     __tablename__ = "conversations"
+
+     id = Column(Integer, primary_key=True, index=True)
+     query = Column(Text)
+     response = Column(Text)
+     context = Column(Text)
+     base_context = Column(Text)
+     system_prompt = Column(Text)
+     full_prompt = Column(Text)
+     timestamp = Column(DateTime, default=datetime.utcnow)
+
+
+ # Pydantic models for API
+ class ChatResponse(BaseModel):
+     response: str
+     timestamp: datetime
+
+
+ class ChatRequest(BaseModel):
+     query: str
+     history: list[dict] | None = None  # optional conversation history
+
+
+ # Initialize Gemini API
+ # API key is configured during app startup (lifespan) to avoid import-time failures
+
+
+ # Lifespan function to handle startup and shutdown
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Startup
+     print("Starting RAG Chat API...")
+     print(f"SQLite DB path: {os.path.join(DATA_DIR, 'conversations.db')}")
+     # Ensure tables now that directory is confirmed writable
+     Base.metadata.create_all(bind=engine)
+
+     # Configure Gemini here (fail early but at startup)
+     API_KEY = os.getenv("GEMINI_API_KEY")
+     if not API_KEY:
+         raise RuntimeError("Please set GEMINI_API_KEY environment variable")
+     genai.configure(api_key=API_KEY)
+
+     try:
+         success, chunks_count = initialize_system()
+         if success:
+             print(f"✅ RAG system initialized successfully with {chunks_count} chunks")
+             print("API ready at: http://localhost:8000")
+             print("API documentation at: http://localhost:8000/docs")
+         else:
+             print("❌ Failed to initialize RAG system")
+             raise RuntimeError("System initialization failed")
+     except Exception as e:
+         print(f"❌ Initialization error: {str(e)}")
+         raise RuntimeError(f"System initialization failed: {str(e)}")
+
+     yield  # This is where the app runs
+
+     # Shutdown (if needed)
+     print("Shutting down RAG Chat API...")
+
+
+ # Initialize FastAPI app with lifespan
+ app = FastAPI(
+     title="RAG Chat API",
+     description="RAG Chat System with Database Integration",
+     lifespan=lifespan,
+ )
+
+ """API only module.
+
+ The web chat UI has been moved to a separate app (see web_app.py). This file now
+ exposes only the JSON API endpoints so it can be containerised independently or
+ scaled separately from the frontend.
+ """
+
+
+ # Enable CORS for local dev
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[
+         "http://localhost",
+         "http://localhost:3000",
+         "http://127.0.0.1:3000",
+         "http://localhost:8001",  # web UI container
+         "http://127.0.0.1:8001",
+     ],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Global variables to store precomputed data
+ chunks_data = None
+ chunk_embeddings = None
+ faiss_index = None
+ base_chunk = None
+ system_prompt = None
+ model_embedding = None
+
+
+ # Dependency to get database session
+ def get_db():
+     db = SessionLocal()
+     try:
+         yield db
+     finally:
+         db.close()
+
+
+ def load_chunks(json_file):
+     """Load chunks from a JSON (array) or JSON Lines file."""
+     try:
+         with open(json_file, "r", encoding="utf-8") as file:
+             text = file.read()
+     except FileNotFoundError:
+         raise FileNotFoundError(f"File {json_file} not found!")
+     try:
+         return json.loads(text)
+     except json.JSONDecodeError:
+         # Fall back to JSON Lines (one object per line), as written by generate_rag_data.py
+         try:
+             return [json.loads(line) for line in text.splitlines() if line.strip()]
+         except json.JSONDecodeError:
+             raise ValueError(f"Invalid JSON in {json_file}")
+
+
+ def compute_and_cache_embeddings(chunks):
+     """Compute embeddings for all chunks and cache them"""
+     global chunk_embeddings, faiss_index
+
+     print("Computing embeddings for all chunks...")
+     texts = [chunk["content"] for chunk in chunks]
+
+     # Load or compute embeddings
+     if os.path.exists(EMBEDDINGS_FILE):
+         print("Loading cached embeddings...")
+         chunk_embeddings = np.load(EMBEDDINGS_FILE)
+         if chunk_embeddings.shape[0] != len(texts):
+             print("Cached embeddings count mismatches chunks. Recomputing...")
+             chunk_embeddings = model_embedding.encode(
+                 texts, convert_to_numpy=True
+             ).astype("float32")
+             np.save(EMBEDDINGS_FILE, chunk_embeddings)
+     else:
+         print("Computing new embeddings (this may take a moment)...")
+         chunk_embeddings = model_embedding.encode(texts, convert_to_numpy=True).astype(
+             "float32"
+         )
+         np.save(EMBEDDINGS_FILE, chunk_embeddings)
+         print("Embeddings cached for future use.")
+
+     # Normalize embeddings (for cosine similarity with IndexFlatIP)
+     faiss.normalize_L2(chunk_embeddings)
+
+     # Create or load FAISS index
+     embedding_dim = chunk_embeddings.shape[1]
+     if os.path.exists(FAISS_INDEX_FILE):
+         print("Loading cached FAISS index...")
+         faiss_index = faiss.read_index(FAISS_INDEX_FILE)
+         # Validate index matches embeddings
+         if getattr(faiss_index, "ntotal", 0) != chunk_embeddings.shape[0]:
+             print("FAISS index size mismatches embeddings. Rebuilding index...")
+             faiss_index = faiss.IndexFlatIP(embedding_dim)
+             faiss_index.add(chunk_embeddings)
+             faiss.write_index(faiss_index, FAISS_INDEX_FILE)
+     else:
+         print("Creating new FAISS index...")
+         faiss_index = faiss.IndexFlatIP(embedding_dim)
+         faiss_index.add(chunk_embeddings)
+         faiss.write_index(faiss_index, FAISS_INDEX_FILE)
+         print("FAISS index cached for future use.")
+
+
+ def retrieve_relevant_chunks(query, top_k=3):
+     """Retrieve most relevant chunks for the query using precomputed embeddings"""
+     global chunks_data, faiss_index
+
+     if faiss_index is None or chunks_data is None:
+         raise RuntimeError("RAG index not initialized")
+
+     # Encode query
+     query_embedding = model_embedding.encode([query], convert_to_numpy=True).astype(
+         "float32"
+     )
+     faiss.normalize_L2(query_embedding)
+
+     top_k = min(top_k, len(chunks_data))
+     # Search in precomputed index
+     _, indices = faiss_index.search(query_embedding, top_k)
+     return [chunks_data[i] for i in indices[0]]
+
+
+ def _format_history(history: list[dict] | None, max_turns: int = 6) -> str:
+     """Format recent conversation history for inclusion in the prompt."""
+     if not history:
+         return ""
+     recent = history[-max_turns:]
+     lines = []
+     for turn in recent:
+         role = turn.get("role", "user")
+         msg = (turn.get("message") or "").strip()
+         if not msg:
+             continue
+         prefix = "User" if role == "user" else "Assistant"
+         lines.append(f"{prefix}: {msg}")
+     return "\n".join(lines)
+
+
+ def construct_prompt(base_chunk, system_prompt, query, history_text: str = ""):
+     """Construct the full prompt with relevant context"""
+     relevant_chunks = retrieve_relevant_chunks(query)
+     context = "\n\n".join(chunk["content"] for chunk in relevant_chunks)
+     full_prompt = (
+         f"System prompt:\n{system_prompt['content']}\n\n"
+         f"Context:\n{context}\n\n"
+         f"{base_chunk['content']}\n\n"
+     )
+     if history_text:
+         full_prompt += f"Recent conversation:\n{history_text}\n\n"
+     full_prompt += f"Query:\n{query}"
+     return full_prompt, context
+
+
+ def get_answer(prompt):
+     """Get answer from Gemini API"""
+     try:
+         model = genai.GenerativeModel("gemini-2.5-flash")
+         response = model.generate_content(prompt)
+         return response.text
+     except Exception as e:
+         print(f"Error getting response from Gemini: {e}")
+         return None
+
+
+ def run_generate_rag_data():
+     """Run the data generation script if available."""
+     script_path = os.path.join(SCRIPT_DIR, "generate_rag_data.py")
+     if not os.path.isfile(script_path):
+         print("generate_rag_data.py not found; skipping automatic generation.")
+         return
+     print("Running generate_rag_data.py to build RAG data...")
+     try:
+         subprocess.run([sys.executable, script_path], cwd=SCRIPT_DIR, check=True)
+         print("generate_rag_data.py completed.")
+     except subprocess.CalledProcessError as e:
+         raise RuntimeError(f"generate_rag_data.py failed (exit {e.returncode})") from e
+
+
+ def initialize_system():
+     """Initialize the RAG system with precomputed embeddings"""
+     global chunks_data, base_chunk, system_prompt, model_embedding
+
+     try:
+         # If embeddings or required JSON files are missing, (re)generate data first.
+         need_generation = (
+             not os.path.exists(EMBEDDINGS_FILE)
+             or not os.path.exists(OUTPUT_CHUNKS_FILE)
+             or not os.path.exists(RAG_CONFIG_FILE)
+         )
+         if need_generation:
+             print("RAG data or embeddings missing. Triggering data generation...")
+             run_generate_rag_data()
+
+         # Initialize embedding model
+         print("Loading embedding model...")
+         model_embedding = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
+
+         # Load configurations
+         print("Loading chunks and configuration...")
+         chunks_data = load_chunks(OUTPUT_CHUNKS_FILE)
+         config = load_chunks(RAG_CONFIG_FILE)[0]
+         base_chunk = config["base_chunk"]
+         system_prompt = config["system_prompt"]
+
+         print(f"Loaded {len(chunks_data)} chunks from knowledge base")
+
+         # Precompute embeddings once (will compute if file absent)
+         compute_and_cache_embeddings(chunks_data)
+
+         print("System initialized successfully!")
+         return True, len(chunks_data)
+
+     except Exception as e:
+         print(f"Failed to initialize system: {e}")
+         return False, 0
+
+
+ @app.post("/chat", response_model=ChatResponse)
+ async def chat_endpoint(payload: ChatRequest, db: Session = Depends(get_db)):
+     """Chat endpoint that processes queries and saves conversations to database
+
+     Accepts a JSON body: {"query": "...", "history": [...]}; "history" is optional.
+     """
+     global base_chunk, system_prompt
+
+     query = (payload.query or "").strip()
+     if not query:
+         raise HTTPException(status_code=400, detail="Query cannot be empty")
+
+     try:
+         # Construct prompt and get answer
+         history_text = _format_history(payload.history)
+         full_prompt, context = construct_prompt(
+             base_chunk, system_prompt, query, history_text
+         )
+
+         # Avoid blocking the event loop with a sync network call
+         answer = await run_in_threadpool(get_answer, full_prompt)
+         if not answer:
+             answer = "Sorry, I failed to get a response from Gemini. Please try again."
+
+         # Save conversation to database
+         conversation = Conversation(
+             query=query,
+             response=answer,
+             context=context,
+             base_context=base_chunk["content"],
+             system_prompt=system_prompt["content"],
+             full_prompt=full_prompt,
+         )
+
+         db.add(conversation)
+         db.commit()
+
+         return ChatResponse(response=answer, timestamp=conversation.timestamp)
+
+     except Exception as e:
+         db.rollback()
+         raise HTTPException(status_code=500, detail=f"Chat processing error: {str(e)}")
+
+
+ # Simple health probe
+ @app.get("/health")
+ def health():
+     return {"status": "ok"}
+
+
+ if __name__ == "__main__":
+     # Check environment variable (os is already imported at module level)
+     if not os.getenv("GEMINI_API_KEY"):
+         print("Warning: GEMINI_API_KEY environment variable not set!")
+         print("Please set it with: set GEMINI_API_KEY=your_api_key_here")
+         exit(1)
+
+     print("Starting RAG Chat API server...")
+     uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True, log_level="info")
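
A minimal client sketch for the /chat endpoint above (a hypothetical example, not part of the commit: it assumes the API is running locally on port 8000 and that the requests package is installed; the history entries follow the {"role", "message"} shape that _format_history expects):

import requests

payload = {
    "query": "How does the SOM instruction work?",
    # Optional recent turns; any role other than "user" is rendered as "Assistant"
    "history": [
        {"role": "user", "message": "What is Maszyna W?"},
        {"role": "assistant", "message": "A simple teaching model of a processor."},
    ],
}
resp = requests.post("http://localhost:8000/chat", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["response"])
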
docker-compose.yml ADDED
@@ -0,0 +1,32 @@
+ version: '3.8'
+ services:
+   rag-api:
+     build:
+       context: .
+       args:
+         SERVICE_TARGET: api
+     image: rag-chat-api:latest
+     container_name: rag-chat-api
+     environment:
+       - GEMINI_API_KEY=${GEMINI_API_KEY}
+     volumes:
+       - ./data:/app/data
+       - ./output_chunks.jsonl:/app/output_chunks.jsonl:ro
+       - ./rag_prompt_config.jsonl:/app/rag_prompt_config.jsonl:ro
+     ports:
+       # the container listens on 7860 (Dockerfile CMD); publish it as 8000 on the host
+       - "8000:7860"
+     restart: unless-stopped
+   rag-web:
+     build:
+       context: .
+       args:
+         SERVICE_TARGET: web
+     image: rag-chat-web:latest
+     container_name: rag-chat-web
+     environment:
+       # the API container serves on its internal port 7860 (see Dockerfile CMD)
+       - API_BASE=http://rag-api:7860
+     depends_on:
+       - rag-api
+     ports:
+       - "8001:8001"
+     restart: unless-stopped
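
Once the stack is up (docker compose up), the /health probe defined in api.py gives a quick smoke test from the host; a sketch assuming the requests package is available:

import requests

# Port 7860 inside the rag-api container is published as 8000 on the host (see ports above)
print(requests.get("http://localhost:8000/health", timeout=5).json())  # {"status": "ok"}
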
generate_rag_data.py ADDED
@@ -0,0 +1,223 @@
+ from google import genai
+ from typing import List
+ from pathlib import Path
+ import fitz
+ import json
+ import os
+ from settings import Chunk, Settings
+
+
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+ DATA_DIR = os.path.join(SCRIPT_DIR, "data")
+ os.makedirs(DATA_DIR, exist_ok=True)
+
+ # Input: put your raw source files (txt/markdown) inside ./data/source
+ SOURCE_DIR = os.path.join(DATA_DIR, "source")
+ os.makedirs(SOURCE_DIR, exist_ok=True)
+
+ # Output artifact locations (align with api.py expectations)
+ OUTPUT_CHUNKS_FILE = os.path.join(
+     SCRIPT_DIR, "output_chunks.jsonl"
+ )  # already used in api.py
+ RAG_CONFIG_FILE = os.path.join(
+     SCRIPT_DIR, "rag_prompt_config.jsonl"
+ )  # already used in api.py
+ # If you also want these in data/ instead, uncomment:
+ # OUTPUT_CHUNKS_FILE = os.path.join(DATA_DIR, "output_chunks.jsonl")
+ # RAG_CONFIG_FILE = os.path.join(DATA_DIR, "rag_prompt_config.jsonl")
+
+ # Example system / base prompts (edit as needed)
+ SYSTEM_PROMPT = {
+     "role": "system",
+     "content": "You are a helpful RAG assistant. Use only the provided context. If unsure, say you don't know.",
+ }
+ BASE_CHUNK = {
+     "role": "base",
+     "content": "Answer the user's query using only the contextual chunks below.",
+ }
+
+
+ def extract_pdf_text(filename: str) -> str:
+     text = ""
+     with fitz.open(filename) as doc:
+         for page in doc:
+             text += page.get_text()
+     return text
+
+
+ def chunk_pdf(filename: str) -> List[Chunk]:
+     text = extract_pdf_text(filename)
+     pdf_name = Path(filename).name
+
+     prompt = f"""
+     Split the following text into coherent chunks suitable for RAG.
+     Each chunk should be 100-500 words.
+     Do not cut mid-sentence, paragraph, or table.
+     Preserve headings, bullet points, and tables.
+
+     Return an array of JSON objects with this structure:
+     {{
+         "content": "<chunk text>",
+         "source": "{pdf_name}",
+         "tags": [],
+         "type": "prg"
+     }}
+     Text:
+     {text}
+     """
+
+     client = genai.Client()
+     response = client.models.generate_content(
+         model="gemini-2.5-flash",
+         contents=prompt,
+         config={
+             "response_mime_type": "application/json",
+             "response_schema": Settings.response_schema,
+         },
+     )
+
+     chunks: List[Chunk] = response.parsed
+     return chunks
+
+
+ def process_pdf_folder(folder_path):
+     folder = Path(folder_path)
+     pdfs = list(folder.glob("*.pdf"))
+     all_chunks = []
+     if not pdfs:
+         print(f"No PDF files found in {folder_path}")
+         return []
+     pdfs.sort(key=lambda x: x.name)
+     for pdf_file in pdfs:
+         print(f"Processing PDF: {pdf_file.name}")
+         chunks = chunk_pdf(filename=pdf_file)
+         all_chunks.extend(chunks)
+     return all_chunks
+
+
+ def make_prg_chunk(text, filename):
+     return [
+         {
+             "content": text.strip(),
+             "source": Path(filename).name,
+             "tags": [],
+             "type": "prg",
+         }
+     ]
+
+
+ def process_prg_folder(folder_path):
+     folder = Path(folder_path)
+     all_chunks = []
+     prgs = list(folder.glob("*.prg"))
+     if not prgs:
+         print(f"No .prg files found in {folder_path}")
+         return []
+     prgs.sort(key=lambda x: x.name)
+     for prg_file in prgs:
+         print(f"Processing PRG: {prg_file.name}")
+         text = prg_file.read_text(encoding="utf-8", errors="ignore")
+         chunk = make_prg_chunk(text, prg_file.name)
+         all_chunks.extend(chunk)
+     return all_chunks
+
+
+ def read_source_files():
+     """Load all .txt / .md files from SOURCE_DIR."""
+     files = []
+     for name in os.listdir(SOURCE_DIR):
+         if name.lower().endswith((".txt", ".md")):
+             path = os.path.join(SOURCE_DIR, name)
+             with open(path, "r", encoding="utf-8") as f:
+                 files.append((name, f.read()))
+     if not files:
+         # Provide a fallback demo file if none exist
+         demo_path = os.path.join(SOURCE_DIR, "demo.txt")
+         demo_text = (
+             "This is a demo knowledge file.\n"
+             "Add your project or domain documentation as .txt or .md files here."
+         )
+         with open(demo_path, "w", encoding="utf-8") as f:
+             f.write(demo_text)
+         files.append(("demo.txt", demo_text))
+     return files
+
+
+ def chunk_text(text: str, max_chars: int = 1200, overlap: int = 150):
+     """Simple character-based chunking with overlap."""
+     text = text.strip()
+     if not text:
+         return []
+     chunks = []
+     start = 0
+     while start < len(text):
+         end = min(len(text), start + max_chars)
+         chunk = text[start:end]
+         chunks.append(chunk.strip())
+         if end >= len(text):
+             break
+         start = end - overlap
+         if start < 0:
+             start = 0
+     return chunks
+
+
+ def build_chunks():
+     """Create chunk objects suitable for embedding."""
+     all_files = read_source_files()
+     chunks = []
+     idx = 0
+     for filename, content in all_files:
+         parts = chunk_text(content)
+         for part in parts:
+             chunks.append({"id": idx, "source": filename, "content": part})
+             idx += 1
+     return chunks
+
+
+ def write_jsonl(path: str, records):
+     with open(path, "w", encoding="utf-8") as f:
+         for r in records:
+             if hasattr(r, "model_dump"):
+                 r = r.model_dump()  # serialize pydantic Chunk objects (pydantic v2)
+             f.write(json.dumps(r, ensure_ascii=False) + "\n")
+
+
+ def write_config(path: str):
+     """Write system + base prompt config file (list with single object)."""
+     obj = [{"system_prompt": SYSTEM_PROMPT, "base_chunk": BASE_CHUNK}]
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(obj, f, ensure_ascii=False, indent=2)
+
+
+ def main():
+     pdf_folder = r"C:\Users\kogut\Python\Assembler_rag\data\pdfs"
+     prg_folder = r"C:\Users\kogut\Python\Assembler_rag\data\prg"
+     # pdf_folder = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./data/pdfs")
+     # prg_folder = Path(sys.argv[2]) if len(sys.argv) > 2 else None
+
+     all_chunks = process_pdf_folder(pdf_folder)
+     if prg_folder:
+         all_chunks += process_prg_folder(prg_folder)
+     print(f"Collected {len(all_chunks)} chunks from PDF and PRG sources")
+
+     print(f"Generating RAG data from: {SOURCE_DIR}")
+     all_chunks += build_chunks()
+     print(f"Built {len(all_chunks)} chunks in total")
+
+     # Write everything in one pass so the text chunks do not clobber the PDF/PRG chunks
+     write_jsonl(OUTPUT_CHUNKS_FILE, all_chunks)
+     write_config(RAG_CONFIG_FILE)
+     print(f"Wrote chunks to: {OUTPUT_CHUNKS_FILE}")
+     print(f"Wrote config to: {RAG_CONFIG_FILE}")
+     print("Done.")
+
+
+ if __name__ == "__main__":
+     main()
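
As a quick sanity check of the chunk_text overlap logic above, a hypothetical run (illustrative values only; assumes the project dependencies are installed, since importing the module also creates the data/ directories as a side effect):

from generate_rag_data import chunk_text

# 3000 characters with max_chars=1200 and overlap=150: windows start at
# 0, 1050 and 2100, giving chunk lengths 1200, 1200 and 900
parts = chunk_text("x" * 3000, max_chars=1200, overlap=150)
print([len(p) for p in parts])  # [1200, 1200, 900]
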
output_chunks.jsonl ADDED
The diff for this file is too large to render.
 
rag_prompt_config.jsonl ADDED
@@ -0,0 +1,16 @@
+ [
+   {
+     "base_chunk": {
+       "content": "When writing a program in the assembly language of Maszyna W, we use almost exclusively instructions corresponding to the machine instructions of the given processor. It is therefore necessary to learn these instructions, just as it is to become familiar with the architecture of that processor. A program in assembly language consists of instructions and the data on which those instructions operate. The program is made up of consecutive lines; each line may contain a single instruction or the declaration of a single data item. Formally, the syntax of a program line is as follows:\n[<label>:] <instruction or pseudo-instruction> [<argument>]\nwhere:\n<label> – a string of letters and digits serving as a symbolic representation of a specific address\n<instruction or pseudo-instruction> – the symbolic name of one of the processor's instructions or of one of the so-called pseudo-instructions that reserve memory for data (RST, RPA)\n<argument> – a decimal number or one of the labels introduced at the beginning of a line\nThe pseudo-instructions RST and RPA allow you, respectively, to reserve memory for a single data item with a fixed initial value (given as the argument) and to reserve memory for a data item without specifying its initial value (to reserve memory for a data item we can write <label> RST <value> or <label> RPA). The instruction field may contain the name of one of the available instructions. We assume that the processor of Maszyna W provides the 8 instructions listed in the table below.\nName Code Operation\nSTP 000 Stop (terminate) program execution\nDOD 001 Add the contents of the memory cell indicated by the argument to the accumulator\nODE 010 Subtract the contents of the memory cell indicated by the argument from the accumulator\nPOB 011 Load the contents of the memory cell indicated by the argument into the accumulator\nŁAD 100 Store the contents of the accumulator in the memory cell indicated by the argument\nSOB 101 Make the next executed instruction the one located in the memory cell indicated by the argument (the so-called unconditional jump)\nSOM 110 If the accumulator holds a negative number, the next instruction executed is the one located in the memory cell indicated by the argument. If the accumulator holds a non-negative number, the next instruction executed is the one located in memory directly after the SOM instruction\nSOZ 111 A jump to the address indicated by the argument, taken only when the accumulator holds 0. Otherwise the next instruction executed is the one located in memory directly after the SOZ instruction\nTo write a program in the assembly language of Maszyna W, you must first devise an algorithm that solves the given task, formulate it using only the available instructions, and finally write it down as lines of a program in assembly language.\n",
+       "source": "Asembler.pdf",
+       "tags": ["Maszyna W operation", "programming", "assembly language"],
+       "type": "explanation"
+     },
+     "system_prompt": {
+       "content": "You are an expert on the Maszyna W language who communicates in Polish.\nAnswer concisely and to the point.\nYour task is to help write a program in this language or to explain how a program works.\nAll the information you have is contained in the context, although if you think something is worth mentioning, do so. Try to be concise and factual.\n\nIf the user asks a question that is not related to the Maszyna W language, let them know and ask them to pose a question about this language.\n\nRemember that your goal is to help the user understand how the Maszyna W language works and to write programs in this language.",
+       "source": "system_prompt",
+       "tags": ["system prompt", "expert"],
+       "type": "system"
+     }
+   }
+ ]
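
For reference, api.py reads this file via load_chunks and takes the first array element; a minimal sketch of that consumption path (run from the repo root):

import json

with open("rag_prompt_config.jsonl", encoding="utf-8") as f:
    config = json.load(f)[0]  # the file holds a JSON array with one object

print(config["base_chunk"]["source"])           # "Asembler.pdf"
print(config["system_prompt"]["content"][:60])  # first characters of the system prompt
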
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ google-generativeai
+ google-genai
+ sentence-transformers
+ faiss-cpu
+ numpy
+ fastapi
+ uvicorn
+ sqlalchemy
+ pydantic
+ PyMuPDF
settings.py ADDED
@@ -0,0 +1,32 @@
+ from pydantic import BaseModel
+ from typing import List
+
+
+ class Chunk(BaseModel):
+     content: str
+     source: str
+     tags: List[str] = []
+     type: str = "prg"
+
+
+ class Settings:
+     response_schema = {
+         "type": "array",
+         "items": {
+             "type": "object",
+             "properties": {
+                 "content": {
+                     "type": "string",
+                     "description": "Text of the chunk (100-500 words, do not cut mid-sentence/paragraph/table)",
+                 },
+                 "source": {"type": "string", "description": "PDF filename"},
+                 "tags": {"type": "array", "items": {"type": "string"}},
+                 "type": {
+                     "type": "string",
+                     "description": "Chunk type",
+                     "default": "prg",
+                 },
+             },
+             "required": ["content", "source", "tags", "type"],
+         },
+     }
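
Since the hand-written response_schema must mirror the Chunk model, a small consistency check can catch drift between the two (a sketch assuming pydantic v2, where model_fields lists the declared fields):

from settings import Chunk, Settings

schema_fields = set(Settings.response_schema["items"]["properties"])
model_fields = set(Chunk.model_fields)  # {"content", "source", "tags", "type"}
assert schema_fields == model_fields, (schema_fields, model_fields)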