| import os |
| from pathlib import Path |
| from typing import List |
|
|
def read_markdown_files(docs_path: Path) -> List[str]:
    """Read every ``*.md`` file directly inside *docs_path* and return the contents.

    Paths are visited in sorted order so the result is deterministic; the
    glob itself gives no ordering guarantee, and callers that number the
    returned documents by position would otherwise get unstable numbering.

    Args:
        docs_path: Directory to search. The pattern is not recursive, so only
            direct children are matched.

    Returns:
        One string per markdown file, decoded as UTF-8, in sorted path order.
        An empty list when nothing matches.
    """
    return [
        md_file.read_text(encoding="utf-8")
        for md_file in sorted(docs_path.glob("*.md"))
        # glob matches any directory entry; skip e.g. a folder named "x.md".
        if md_file.is_file()
    ]
|
|
def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - chunk_overlap`` words apart, so
    each chunk repeats the last ``chunk_overlap`` words of the previous one.
    Words are re-joined with single spaces; original whitespace is not kept.

    Args:
        text: Text to split. Empty or whitespace-only text yields ``[]``.
        chunk_size: Maximum number of words per chunk.
        chunk_overlap: Number of words shared between neighbouring chunks.
            Must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        The chunks in document order. The final chunk may be shorter than
        ``chunk_size``.

    Raises:
        ValueError: If ``chunk_overlap`` is negative or not smaller than
            ``chunk_size`` (the window would not advance, or would skip words).
    """
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size "
            f"(got chunk_size={chunk_size}, chunk_overlap={chunk_overlap})"
        )

    words = text.split()
    step = chunk_size - chunk_overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would hold
        # only words already present in this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
|
|
def generate_embeddings(text_chunks: List[str]) -> List[List[float]]:
    """Return one placeholder embedding per text chunk.

    Stand-in for the Gemini model integration: the chunk contents are not
    inspected, and every vector is 768 copies of ``0.1``. Each chunk gets its
    own list object.
    """
    print("Generating embeddings (placeholder)...")

    vectors = []
    for _ in text_chunks:
        vectors.append([0.1] * 768)
    return vectors
|
|
def store_in_qdrant(embeddings: List[List[float]], metadatas: List[dict]):
    """Report how many embeddings would be stored; nothing is persisted yet.

    Stand-in for the Qdrant client integration. ``metadatas`` is accepted for
    the eventual signature but is not read by this placeholder.
    """
    print(f"Storing {len(embeddings)} embeddings in Qdrant (placeholder)...")
    return None
|
|
def process_content(docs_root: Path):
    """Run the content pipeline: read markdown, chunk, embed, and store.

    Every chunk is paired with a metadata dict ``{"source": "chapter_<n>"}``,
    where ``<n>`` is the 1-based position of its document in the list returned
    by ``read_markdown_files``.

    Args:
        docs_root: Directory passed to ``read_markdown_files``.

    Returns:
        None. Returns early, before embedding, when no chunks were produced.
    """
    print(f"Processing markdown files from: {docs_root}")
    all_markdown_text = read_markdown_files(docs_root)

    all_chunks = []
    all_metadatas = []
    for i, text in enumerate(all_markdown_text):
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        # Build a fresh dict per chunk. ``[{...}] * len(chunks)`` would repeat
        # a reference to ONE dict, so editing any chunk's metadata later would
        # silently change it for every chunk of the same document.
        all_metadatas.extend({"source": f"chapter_{i+1}"} for _ in chunks)

    if not all_chunks:
        print("No content to process.")
        return

    embeddings = generate_embeddings(all_chunks)
    store_in_qdrant(embeddings, all_metadatas)
    print("Content processing complete.")
|
|
if __name__ == "__main__":
    # Script entry point: take four ``.parent`` hops up from this file's path,
    # then descend into frontend/docs and run the pipeline on that directory.
    script_path = Path(__file__)
    base_dir = script_path.parent.parent.parent.parent
    frontend_docs_path = base_dir / "frontend" / "docs"
    process_content(frontend_docs_path)
|
|