Spaces:
Build error
Update backend_app/ingest.py
backend_app/ingest.py  +213 -111  CHANGED

@@ -1,112 +1,214 @@
-import os
-import json
-import pickle
-from typing import List, Dict
-
-import numpy as np
-import faiss
-from sentence_transformers import SentenceTransformer
-
-def
-"""
-"""
+import os
+import json
+import pickle
+from typing import List, Dict, Tuple
+
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from pypdf import PdfReader
+
+from .config import (
+    DATA_DIR,
+    URLS_PATH,
+    FAISS_INDEX_PATH,
+    DOCSTORE_PATH,
+    EMBED_MODEL_NAME,
+)
+from .fetcher import fetch_page_text
+
+
+DOCS_DIR = os.path.join(DATA_DIR, "docs")
+
+
+def ensure_data_dir():
+    os.makedirs(DATA_DIR, exist_ok=True)
+    os.makedirs(DOCS_DIR, exist_ok=True)  # safe even if empty
+
+
+def load_urls() -> List[str]:
+    """
+    Expects data/urls.json like:
+    { "urls": ["https://...", "https://..."] }
+    """
+    if not os.path.exists(URLS_PATH):
+        # If urls.json missing, we allow ingestion to continue with local docs only
+        return []
+
+    with open(URLS_PATH, "r", encoding="utf-8") as f:
+        obj = json.load(f)
+
+    urls = obj.get("urls", [])
+    return [u.strip() for u in urls if isinstance(u, str) and u.strip()]
+
+
+def chunk_text(text: str, chunk_size_words: int = 900, overlap_words: int = 150) -> List[str]:
+    """
+    Simple word-based chunking (fast + reliable).
+    """
+    text = (text or "").strip()
+    if not text:
+        return []
+
+    words = text.split()
+    chunks = []
+    i = 0
+    step = max(1, chunk_size_words - overlap_words)
+
+    while i < len(words):
+        chunk = words[i:i + chunk_size_words]
+        chunks.append(" ".join(chunk))
+        i += step
+
+    return chunks
+
+
+# -------------------------
+# URL ingestion
+# -------------------------
+def build_docs_from_urls(urls: List[str]) -> List[Dict]:
+    docs: List[Dict] = []
+    for url in urls:
+        try:
+            page = fetch_page_text(url, use_cache=True)
+            chunks = chunk_text(page.get("text", ""))
+
+            for idx, ch in enumerate(chunks):
+                docs.append({
+                    "text": ch,
+                    "meta": {
+                        "source_type": "url",
+                        "url": page.get("url", url),
+                        "title": page.get("title", url),
+                        "chunk": idx,
+                    }
+                })
+        except Exception:
+            # skip bad URLs but continue ingestion
+            continue
+    return docs
+
+
+# -------------------------
+# Local docs ingestion
+# -------------------------
+def list_local_files() -> List[str]:
+    """
+    Reads local files from data/docs/
+    Supported: .txt, .md, .pdf (text-based PDFs)
+    """
+    if not os.path.exists(DOCS_DIR):
+        return []
+
+    paths = []
+    for name in os.listdir(DOCS_DIR):
+        p = os.path.join(DOCS_DIR, name)
+        if not os.path.isfile(p):
+            continue
+        ext = os.path.splitext(name)[1].lower()
+        if ext in [".txt", ".md", ".pdf"]:
+            paths.append(p)
+    return sorted(paths)
+
+
+def read_text_file(path: str) -> str:
+    with open(path, "r", encoding="utf-8", errors="ignore") as f:
+        return f.read()
+
+
+def read_pdf_text(path: str) -> str:
+    """
+    Works best on selectable-text PDFs.
+    Scanned/image-only PDFs will extract very little.
+    """
+    reader = PdfReader(path)
+    parts = []
+    for page in reader.pages:
+        try:
+            parts.append(page.extract_text() or "")
+        except Exception:
+            continue
+    return "\n".join(parts).strip()
+
+
+def build_docs_from_files(file_paths: List[str]) -> List[Dict]:
+    docs: List[Dict] = []
+
+    for path in file_paths:
+        name = os.path.basename(path)
+        ext = os.path.splitext(name)[1].lower()
+
+        try:
+            if ext in [".txt", ".md"]:
+                text = read_text_file(path)
+            elif ext == ".pdf":
+                text = read_pdf_text(path)
+            else:
+                continue
+        except Exception:
+            continue
+
+        chunks = chunk_text(text)
+        for idx, ch in enumerate(chunks):
+            docs.append({
+                "text": ch,
+                "meta": {
+                    "source_type": "file",
+                    "url": f"file://{name}",
+                    "title": name,
+                    "chunk": idx,
+                }
+            })
+
+    return docs
+
+
+# -------------------------
+# Index building
+# -------------------------
+def build_faiss_index(docs: List[Dict]) -> None:
+    model = SentenceTransformer(EMBED_MODEL_NAME)
+
+    texts = [d["text"] for d in docs]
+    emb = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
+    emb = np.array(emb, dtype="float32")
+
+    index = faiss.IndexFlatIP(emb.shape[1])
+    index.add(emb)
+
+    faiss.write_index(index, FAISS_INDEX_PATH)
+
+    with open(DOCSTORE_PATH, "wb") as f:
+        pickle.dump(docs, f)
+
+
+def run_ingestion():
+    ensure_data_dir()
+
+    urls = load_urls()
+    url_docs = build_docs_from_urls(urls) if urls else []
+
+    file_paths = list_local_files()
+    file_docs = build_docs_from_files(file_paths) if file_paths else []
+
+    docs = url_docs + file_docs
+
+    if not docs:
+        raise RuntimeError(
+            "No documents found.\n"
+            "- Add URLs to data/urls.json OR\n"
+            "- Add files to data/docs/ (.txt, .md, .pdf)"
+        )
+
+    build_faiss_index(docs)
+
+    print("✅ Ingestion complete")
+    print(f"URLs: {len(urls)}")
+    print(f"Local files: {len(file_paths)}")
+    print(f"Chunks: {len(docs)}")
+    print(f"Saved index: {FAISS_INDEX_PATH}")
+    print(f"Saved docs: {DOCSTORE_PATH}")
+
+
+if __name__ == "__main__":
    run_ingestion()
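
The artifacts this commit writes (the FAISS index at FAISS_INDEX_PATH and the pickled chunk list at DOCSTORE_PATH) are presumably read back by the Space's retrieval code, which is not part of this diff. A minimal sketch of such a consumer, assuming the same config constants and embedding model and normalizing the query the same way so the inner-product index scores cosine similarity (the module and function names below are illustrative, not from this repo):

# Sketch only: load what build_faiss_index() saves and run a similarity query.
# Assumes backend_app is importable as a package and the config constants match.
import pickle

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

from backend_app.config import DOCSTORE_PATH, EMBED_MODEL_NAME, FAISS_INDEX_PATH


def search(query: str, k: int = 5):
    index = faiss.read_index(FAISS_INDEX_PATH)
    with open(DOCSTORE_PATH, "rb") as f:
        docs = pickle.load(f)  # same list of {"text", "meta"} chunks dumped above

    model = SentenceTransformer(EMBED_MODEL_NAME)
    q = np.array(model.encode([query], normalize_embeddings=True), dtype="float32")

    scores, ids = index.search(q, k)  # inner product == cosine on normalized vectors
    return [(float(s), docs[i]) for s, i in zip(scores[0], ids[0]) if i != -1]

Ingestion itself is driven by filling data/urls.json (e.g. { "urls": ["https://example.com/docs"] }) and/or dropping .txt/.md/.pdf files into data/docs/, then running the module as a package entry point (python -m backend_app.ingest), since it relies on relative imports.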