import os
from pathlib import Path
from typing import List
def read_markdown_files(docs_path: Path) -> List[str]:
    """Read every ``*.md`` file directly under *docs_path* (non-recursive).

    Args:
        docs_path: Directory to scan for markdown files.

    Returns:
        The full text of each markdown file, in sorted filename order.
        Sorting makes runs deterministic: ``Path.glob`` yields files in
        filesystem-dependent order, which would otherwise shuffle the
        chunk/metadata ordering downstream.
    """
    # read_text() replaces the manual open/read pair and still decodes as UTF-8.
    return [
        md_file.read_text(encoding="utf-8")
        for md_file in sorted(docs_path.glob("*.md"))
    ]
def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    This is a very basic chunking implementation. A more sophisticated
    approach would consider sentence boundaries, Docusaurus-specific
    markdown structure, etc.

    Args:
        text: Source text; split on any whitespace.
        chunk_size: Maximum number of words per chunk.
        chunk_overlap: Number of words shared between consecutive chunks.

    Returns:
        Non-empty chunk strings; ``[]`` for empty/whitespace-only text.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap``
            is negative or >= ``chunk_size``. The original step
            ``chunk_size - chunk_overlap`` would otherwise be zero
            (``range`` raises) or negative (silently returns no chunks).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap < 0 or chunk_overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
        )
    words = text.split()
    step = chunk_size - chunk_overlap  # guaranteed > 0 by the checks above
    chunks = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        if chunk:  # skip the empty tail produced by an exact boundary
            chunks.append(chunk)
    return chunks
def generate_embeddings(text_chunks: List[str]) -> List[List[float]]:
    """Return one embedding vector per chunk. Placeholder for Gemini model integration.

    A real implementation would call a Gemini embedding model; until then
    every chunk maps to a fresh 768-dimensional dummy vector of 0.1s.
    """
    print("Generating embeddings (placeholder)...")
    dimension = 768  # assumed embedding width for now
    return [[0.1 for _ in range(dimension)] for _chunk in text_chunks]
def store_in_qdrant(embeddings: List[List[float]], metadatas: List[dict]):
    """Persist embeddings plus metadata in Qdrant. Placeholder for Qdrant client integration.

    A real implementation would upsert points through the Qdrant client;
    for now this only reports how many vectors would be written.
    """
    print(f"Storing {len(embeddings)} embeddings in Qdrant (placeholder)...")
def process_content(docs_root: Path):
    """Orchestrates the content processing pipeline: read, chunk, embed, store.

    Args:
        docs_root: Directory containing the markdown source files.
    """
    print(f"Processing markdown files from: {docs_root}")
    all_chunks: List[str] = []
    all_metadatas: List[dict] = []
    # Simple metadata for now (1-based file index); can be expanded to
    # include source file, chapter, etc.
    for doc_index, doc_text in enumerate(read_markdown_files(docs_root), start=1):
        for piece in chunk_text(doc_text):
            all_chunks.append(piece)
            all_metadatas.append({"source": f"chapter_{doc_index}"})
    if not all_chunks:
        print("No content to process.")
        return
    store_in_qdrant(generate_embeddings(all_chunks), all_metadatas)
    print("Content processing complete.")
if __name__ == "__main__":
# Example usage:
# Assuming the script is run from the project root (D:\hackathon_01)
# and docs are in frontend/docs
frontend_docs_path = Path(__file__).parent.parent.parent.parent / "frontend" / "docs"
process_content(frontend_docs_path)