Spaces:
Runtime error
Runtime error
Commit
·
36425a4
1
Parent(s):
40303c6
'Upload'
Browse files- .gitignore +3 -0
- Dockerfile +20 -0
- app/api/query.py +16 -0
- app/api/translate.py +55 -0
- app/main.py +17 -0
- app/schemas.py +10 -0
- app/services/embedding_service.py +23 -0
- app/services/qdrant_service.py +83 -0
- app/services/rag_service.py +78 -0
- app/utils/chunking.py +52 -0
- scripts/index_chapters.py +49 -0
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
# Secrets — never commit API keys.
.env

# Python bytecode.
__pycache__/

# NOTE: requirements.txt must stay tracked — the Dockerfile runs
# `COPY ./requirements.txt` during the image build, so ignoring it
# breaks the Hugging Face Space build. It was removed from this file.
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.10

# Install OS-level libraries (needed by OpenCV-style wheels) while still root.
RUN apt-get update && apt-get install -y libgl1 libglib2.0-0

# Create a non-root user and switch to it (required by Hugging Face Spaces).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install Python dependencies first so Docker layer caching survives code edits.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app

# FIX: the FastAPI instance is created in app/main.py, so the uvicorn import
# string must be "app.main:app". The previous "app:app" pointed at the package
# itself and crashed uvicorn at startup ("Runtime error" on the Space).
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
app/api/query.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging

from fastapi import APIRouter, HTTPException

from ..schemas import QueryRequest, QueryResponse
from ..services.rag_service import get_rag_response

logger = logging.getLogger(__name__)

router = APIRouter()


@router.post("/query", response_model=QueryResponse)
async def query_chatbot(request: QueryRequest):
    """Answer a chat message via the RAG pipeline.

    Returns the generated answer plus the list of source documents.
    Raises HTTP 500 with a generic message on any backend failure.
    """
    try:
        response_text, sources = await get_rag_response(request.message, request.selected_text)
        return QueryResponse(response=response_text, sources=sources)
    except Exception:
        # Log the full traceback server-side for debugging...
        logger.exception("Unhandled error while serving /query")
        # ...but do NOT echo internal exception details to the client.
        # FIX: the previous code interpolated the raw exception into the
        # response detail, leaking implementation internals to the frontend.
        raise HTTPException(status_code=500, detail="An internal error occurred in the backend.")
app/api/translate.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

from fastapi import APIRouter, HTTPException
from google.generativeai import GenerativeModel, configure
from pydantic import BaseModel


class TranslationRequest(BaseModel):
    # Text to translate.
    text: str
    # Target language; defaults to Urdu per the feature request.
    target_language: str = "Urdu"


class TranslationResponse(BaseModel):
    # The translated text returned by the model.
    translated_text: str


# Router for the translation API.
router = APIRouter()

# Configure the Gemini SDK once at import time.
# FIX: the previous version only read the key and claimed no configure() call
# was needed — google.generativeai requires configure(api_key=...) before any
# model call; this module only worked by accident of import order (rag_service
# happened to configure the SDK first).
gemini_api_key = os.getenv("GEMINI_API_KEY")
if gemini_api_key:
    configure(api_key=gemini_api_key)
else:
    # Don't raise at import time: let the endpoint return a clean 500 instead
    # of preventing the whole app from starting.
    print("Configuration Error: GEMINI_API_KEY environment variable not set for Python backend.")

# Model used for translation requests.
GEMINI_TRANSLATION_MODEL = 'gemini-2.5-flash'


@router.post("/translate", response_model=TranslationResponse)
async def translate_text(request: TranslationRequest):
    """
    Translates the given text to the target language using the Gemini API.

    Raises HTTP 500 if the server is unconfigured or the model call fails.
    """
    if not gemini_api_key:
        raise HTTPException(status_code=500, detail="Server is not configured for translation (missing API key).")

    model = GenerativeModel(GEMINI_TRANSLATION_MODEL)

    # Construct a clear and direct prompt for the translation task.
    prompt = f"Translate the following text to {request.target_language}:\n\n---\n{request.text}\n---"

    try:
        print(f"Sending translation request to Gemini for target language: {request.target_language}")
        response = await model.generate_content_async(prompt)

        # Extract the translated text from the response.
        translated_text = response.text.strip()

        print("Successfully received translation from Gemini.")
        return TranslationResponse(translated_text=translated_text)

    except Exception as e:
        print(f"An unexpected error occurred during translation with Gemini: {e}")
        # FIX: do not leak raw exception details to the client.
        raise HTTPException(status_code=500, detail="An internal error occurred during translation.")
app/main.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from .api import query, translate

app = FastAPI()

# Allow the browser frontend (any origin) to call this API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount the query and translate routers under a common /api prefix.
for api_router in (query.router, translate.router):
    app.include_router(api_router, prefix="/api")
app/schemas.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import List, Optional
from pydantic import BaseModel

# Request body for POST /api/query.
class QueryRequest(BaseModel):
    # The user's chat message / question.
    message: str
    # Text the user highlighted in the book, if any; used as extra context.
    selected_text: Optional[str] = None

# Response body for POST /api/query.
class QueryResponse(BaseModel):
    # The generated answer text.
    response: str
    # Source identifiers the answer was grounded in.
    sources: List[str]
app/services/embedding_service.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
class EmbeddingService:
    """Thin wrapper around a sentence-transformers model for text embedding."""

    def __init__(self):
        # all-MiniLM-L6-v2: small, fast model producing 384-dim embeddings.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def encode(self, texts: List[str]) -> List[List[float]]:
        """Encode text(s) into embedding vectors.

        A bare string is promoted to a one-element batch, so callers may pass
        either a single text or a list of texts.

        Args:
            texts: A string or list of strings to encode.

        Returns:
            A list of embedding vectors (one per input text).
        """
        batch = [texts] if isinstance(texts, str) else texts
        return self.model.encode(batch).tolist()
app/services/qdrant_service.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from qdrant_client import QdrantClient, models
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
+
|
| 5 |
+
class QdrantService:
    """Manages the Qdrant vector collection that stores textbook chunks."""

    def __init__(self):
        """
        Initializes the QdrantService, setting up the client and ensuring the collection exists.

        Raises:
            ValueError: if the QDRANT_URL environment variable is not set.
        """
        self.qdrant_url = os.getenv("QDRANT_URL")
        self.qdrant_api_key = os.getenv("QDRANT_API_KEY")
        self.collection_name = "textbook_chunks"
        self.vector_size = 384  # Embedding dimensionality of all-MiniLM-L6-v2.

        if not self.qdrant_url:
            raise ValueError("QDRANT_URL must be set in environment variables.")

        # The api_key may be None, in which case the client connects to a
        # local or unsecured Qdrant instance.
        self.client = QdrantClient(
            url=self.qdrant_url,
            api_key=self.qdrant_api_key,
        )
        self.ensure_collection()

    def ensure_collection(self):
        """
        Checks if the required collection exists in Qdrant and creates it if it doesn't.
        """
        try:
            self.client.get_collection(collection_name=self.collection_name)
            print(f"Collection '{self.collection_name}' already exists.")
        except Exception:
            print(f"Collection '{self.collection_name}' not found, creating it...")
            # FIX: use create_collection — recreate_collection is deprecated
            # in qdrant-client and would drop an existing collection; this
            # branch is only reached when the collection is missing, so
            # plain creation is the correct intent.
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=models.VectorParams(size=self.vector_size, distance=models.Distance.COSINE),
            )
            print(f"Collection '{self.collection_name}' created successfully.")

    def upsert_chunks(self, ids: List[str], vectors: List[List[float]], payloads: List[Dict[str, Any]]):
        """Upsert a batch of chunk points into the collection.

        Args:
            ids: Point IDs (one per chunk).
            vectors: Embedding vectors, parallel to `ids`.
            payloads: Payload dicts, parallel to `ids`.

        Raises:
            ValueError: if the three lists differ in length.
        """
        if not (len(ids) == len(vectors) == len(payloads)):
            raise ValueError("ids, vectors, and payloads must have the same length")

        points = [
            models.PointStruct(id=id_, vector=vector, payload=payload)
            for id_, vector, payload in zip(ids, vectors, payloads)
        ]

        # wait=True blocks until the write is durable server-side.
        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
            wait=True
        )
        print(f"Upserted {len(points)} chunks successfully.")

    def search(self, query_vector: List[float], limit: int = 5) -> List[Dict[str, Any]]:
        """
        Performs a vector search in the Qdrant collection.

        Args:
            query_vector: The vector representation of the query.
            limit: The maximum number of results to return.

        Returns:
            A list of search results, each containing the payload and score.
        """
        print(f"Searching Qdrant with a vector...")
        search_results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit,
            with_payload=True,  # Ensure the payload is returned with the results.
        )

        # The client returns ScoredPoint objects; flatten to plain dicts.
        results = [
            {"payload": hit.payload, "score": hit.score}
            for hit in search_results
        ]

        print(f"Qdrant search completed. Found {len(results)} results.")
        return results
app/services/rag_service.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

from google.generativeai import GenerativeModel, configure

from .embedding_service import EmbeddingService
from .qdrant_service import QdrantService

# Module-level singletons: the embedding model and the Qdrant client are
# expensive to construct, so build them once at import time.
embedding_service = EmbeddingService()
qdrant_service = QdrantService()

# Configure Gemini API — fail fast at import time if the key is missing.
gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY environment variable not set for Python backend.")
configure(api_key=gemini_api_key)
GEMINI_GENERATIVE_MODEL = 'gemini-2.5-flash'


async def get_rag_response(message: str, selected_text: str | None = None):
    """Answer `message` via RAG: embed the query, retrieve context chunks
    from Qdrant, and generate an answer with Gemini.

    Args:
        message: The user's question.
        selected_text: Optional text the user highlighted in the book; it is
            folded into the search query and prioritized in the prompt.

    Returns:
        Tuple of (response_text, sources) where sources is a de-duplicated,
        relevance-ordered list of source identifiers.
    """
    # The user's question is the primary query for search.
    query_text = message

    # If text is selected, use it to refine the retrieval query.
    if selected_text:
        query_text = f"User's question: '{message}' --- Context from selected text: '{selected_text}'"

    query_embedding = embedding_service.encode([query_text])[0]
    search_results = qdrant_service.search(query_embedding)

    # --- ENHANCED PROMPT CONSTRUCTION ---
    prompt_parts = [
        "You are a helpful expert assistant for the \"Physical AI & Humanoid Robotics\" textbook.",
        "Your task is to answer the user's question. Use the provided context to form your answer.",
        "Be concise, professional, and helpful. If the context does not contain the answer, say that you couldn't find the specific information in the provided materials, but try to answer based on your general knowledge of the topic if appropriate.",
        "\n--- CONTEXT ---"
    ]

    # Prioritize the user's selected text by adding it to the context first.
    if selected_text:
        prompt_parts.append("\n**User-Selected Text:**\n")
        prompt_parts.append(selected_text)
        prompt_parts.append("\n---")

    # Add supplementary context from the vector search.
    context_chunks = [
        hit["payload"].get("content")
        for hit in search_results
        if hit.get("payload") and hit["payload"].get("content")
    ]
    if context_chunks:
        prompt_parts.append("\n**Relevant Excerpts from the Book:**\n")
        prompt_parts.append("\n---\n".join(context_chunks))

    # Handle the case where no context is found at all.
    if not selected_text and not context_chunks:
        prompt_parts.append("\nNo specific context was provided or found.")

    prompt_parts.append("\n--- END OF CONTEXT ---\n")
    prompt_parts.append(f"**User's Question:**\n{message}\n\n**Answer:**\n")

    full_prompt = "\n".join(prompt_parts)

    # Make LLM call to Gemini.
    model = GenerativeModel(GEMINI_GENERATIVE_MODEL)
    try:
        response = await model.generate_content_async(full_prompt)
        response_text = response.text
    except Exception as e:
        print(f"Error calling Gemini API: {e}")
        response_text = "An error occurred while generating the response."

    # Collect source identifiers for the frontend.
    sources = [
        hit["payload"].get("source")
        for hit in search_results
        if hit.get("payload") and hit["payload"].get("source")
    ]
    # FIX: de-duplicate while preserving retrieval (relevance) order —
    # list(set(...)) returned sources in nondeterministic order, and the
    # f"{source}" wrapper was a no-op since sources are already strings.
    formatted_sources = list(dict.fromkeys(sources))

    return response_text, formatted_sources
app/utils/chunking.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import markdown
|
| 2 |
+
import re
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
+
|
| 5 |
+
def semantic_chunking(markdown_content: str, min_tokens: int = 100, max_tokens: int = 512, overlap_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Splits markdown content into semantic chunks, respecting heading boundaries.

    Args:
        markdown_content: The full markdown content of a document.
        min_tokens: Minimum token count for a chunk.
        max_tokens: Maximum token count for a chunk.
        overlap_tokens: Number of tokens to overlap between chunks.

    Returns:
        A list of dictionaries, where each dictionary represents a chunk
        and contains 'content' and 'metadata' (e.g., 'source', 'heading').

    NOTE(review): this is a paragraph-splitting placeholder — "tokens" are
    approximated by whitespace word counts, and min_tokens / overlap_tokens
    are currently unused; confirm before relying on them.
    """
    chunks: List[Dict[str, Any]] = []
    buffer = ""
    heading = "Introduction"  # Default heading until the first '#' line.

    def _flush(text: str) -> None:
        # Emit the accumulated text as one chunk, skipping empty buffers.
        stripped = text.strip()
        if stripped:
            chunks.append({
                "content": stripped,
                "metadata": {"source": "unknown", "heading": heading},
            })

    for block in markdown_content.split('\n\n'):
        # A paragraph starting with '#' marks a new section heading.
        if block.strip().startswith('#'):
            heading = block.strip().lstrip('# ').strip()

        # Word count approximates the token count.
        if len(buffer.split()) + len(block.split()) > max_tokens:
            _flush(buffer)
            buffer = block
        else:
            buffer += "\n\n" + block

    _flush(buffer)
    return chunks
scripts/index_chapters.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import glob
import uuid
from dotenv import load_dotenv

from backend.app.utils.chunking import semantic_chunking
from backend.app.services.embedding_service import EmbeddingService
from backend.app.services.qdrant_service import QdrantService

load_dotenv()


def index_chapters():
    """Chunk every chapter markdown file, embed the chunks, and upsert them
    into Qdrant (one batched upsert per file)."""
    print("Starting chapter indexing...")

    # Services are usable outside the FastAPI app context.
    embedding_service = EmbeddingService()
    qdrant_service = QdrantService()

    chapter_files = glob.glob("frontend/docs/chapter-*.md")
    if not chapter_files:
        # FIX: message previously pointed at website/docs/ while the glob
        # actually scans frontend/docs/.
        print("No chapter files found in frontend/docs/. Please ensure chapters exist.")
        return

    for file_path in chapter_files:
        print(f"Processing {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            markdown_content = f.read()

        chunks = semantic_chunking(markdown_content)
        if not chunks:
            continue

        ids, payloads, contents = [], [], []
        for i, chunk in enumerate(chunks):
            metadata = chunk["metadata"]
            metadata["source"] = file_path  # Record the actual file path.
            metadata["chunk_number"] = i
            ids.append(str(uuid.uuid4()))  # Unique Qdrant point ID.
            contents.append(chunk["content"])
            # FIX: store the chunk text in the payload under "content" —
            # rag_service reads hit["payload"]["content"] at query time, but
            # the previous version upserted only the metadata, so retrieval
            # could never surface any text.
            payloads.append({"content": chunk["content"], **metadata})

        # Batch: one encode call and one upsert per file instead of per chunk.
        vectors = embedding_service.encode(contents)
        qdrant_service.upsert_chunks(ids=ids, vectors=vectors, payloads=payloads)

    print("Chapter indexing completed.")


if __name__ == "__main__":
    index_chapters()