|
|
import os |
|
|
import sys |
|
|
from pathlib import Path |
|
|
import markdown |
|
|
from sqlalchemy.orm import Session |
|
|
|
|
|
|
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..')) |
|
|
|
|
|
from backend.database import get_db, engine |
|
|
from backend.models.book_content import BookContent, Base |
|
|
from backend.ai.embeddings import qwen_embeddings |
|
|
from backend.database.vector_db import vector_db |
|
|
|
|
|
|
|
|
def create_tables():
    """Ensure every ORM-mapped table exists in the target database.

    Idempotent: SQLAlchemy emits CREATE TABLE only for tables that are
    missing, so this is safe to call on every run of the script.
    """
    # Base aggregates all model metadata; engine points at the app DB.
    Base.metadata.create_all(engine)
|
|
|
|
|
|
|
|
def read_markdown_files(docs_dir: str):
    """Recursively collect the paths of all Markdown files under *docs_dir*.

    Both plain Markdown (``.md``) and MDX (``.mdx``) files are included.

    Args:
        docs_dir: Root directory to search.

    Returns:
        A list of file paths (joined with ``os.path.join``), in the order
        ``os.walk`` visits them. Empty if the directory does not exist.
    """
    # A tuple argument lets str.endswith test both suffixes in one call.
    markdown_suffixes = ('.md', '.mdx')
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(docs_dir)
        for name in names
        if name.endswith(markdown_suffixes)
    ]
|
|
|
|
|
|
|
|
def extract_content_from_file(file_path: str):
    """Extract plain text from a Markdown/MDX file.

    Strips a leading YAML frontmatter block (delimited by ``---``) if one
    is present, renders the remaining Markdown to HTML, removes the HTML
    tags, and unescapes HTML entities to recover readable plain text.

    Args:
        file_path: Path to the ``.md``/``.mdx`` file (read as UTF-8).

    Returns:
        The extracted plain text, stripped of surrounding whitespace.
    """
    import re
    from html import unescape

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Drop YAML frontmatter: a "---\n...\n---" header at the file start.
    if content.startswith('---'):
        parts = content.split('---', 2)
        if len(parts) >= 3:
            content = parts[2]

    # Render to HTML, then strip the markup to approximate plain text.
    html = markdown.markdown(content)

    # Raw-string pattern; '<[^>]*>' also removes empty tags, which the
    # previous non-raw '<[^<]+?>' pattern left behind.
    plain_text = re.sub(r'<[^>]*>', '', html)

    # markdown escapes &, <, > etc. into entities (&amp;, &lt;, ...);
    # unescape them so the stored/embedded text reads naturally.
    return unescape(plain_text).strip()
|
|
|
|
|
|
|
|
def process_book_content():
    """Process all book content and store it in the database with embeddings.

    Walks the ``docs`` directory, extracts plain text from each Markdown
    file, generates an embedding for the text, persists a BookContent row,
    and mirrors the embedding into the vector store. A failure on one file
    is logged and skipped; a failure outside the per-file loop rolls back
    the whole session. The session is always closed on exit.
    """
    print("Starting book content processing...")

    # Make sure the schema exists before inserting rows.
    create_tables()

    # get_db() is a generator-style dependency; pull one session from it.
    db_gen = get_db()
    db: Session = next(db_gen)

    try:
        docs_dir = "docs"
        md_files = read_markdown_files(docs_dir)

        print(f"Found {len(md_files)} markdown files to process")

        for file_path in md_files:
            print(f"Processing: {file_path}")

            try:
                content_text = extract_content_from_file(file_path)

                # Skip files that are effectively empty after extraction.
                if len(content_text.strip()) < 10:
                    print(f"Skipping {file_path}, content too short")
                    continue

                # Embed only a prefix to stay within the model's input limit.
                embedding = qwen_embeddings.get_embedding(content_text[:2000])

                book_content = BookContent(
                    title=Path(file_path).stem,
                    content=content_text,
                    source_path=file_path,
                    embedding_vector=str(embedding)
                )

                db.add(book_content)
                # BUG FIX: without a flush the INSERT has not executed and
                # book_content.id is still None, so the vector store was
                # being given a null content_id. Flushing assigns the PK
                # while keeping everything in the same (uncommitted)
                # transaction.
                db.flush()

                vector_db.store_embedding(
                    content_id=book_content.id,
                    embedding=embedding,
                    content_metadata={
                        "title": book_content.title,
                        "content": book_content.content[:500],
                        "source_path": book_content.source_path
                    }
                )

                print(f"Processed: {file_path}")

            except Exception as e:
                # Best-effort per file: log and move on to the next one.
                print(f"Error processing {file_path}: {str(e)}")
                continue

        db.commit()
        print("Book content processing completed successfully!")

    except Exception as e:
        print(f"Error during book content processing: {str(e)}")
        db.rollback()
    finally:
        db.close()
|
|
|
|
|
|
|
|
# Entry point: run the full ingestion pipeline when executed as a script.
if __name__ == "__main__":
    process_book_content()