Spaces:

zeeshan4801
/

CivilEngineeringASTMHelper

Sleeping

App Files Files Community

zeeshan4801 committed on Oct 5, 2025

Commit

74b3e59

verified ·

1 Parent(s): cf8303f

Create app.py

Browse files

Files changed (1) hide show

app.py +167 -0

app.py ADDED Viewed

	@@ -0,0 +1,167 @@

+%%writefile app.py
+# ============================================
+# Civil Engineering RAG (ASTM) - app.py
+# ============================================
+import os
+import fitz                      # PyMuPDF
+import faiss
+import numpy as np
+import gradio as gr
+from typing import List
+from groq import Groq
+from sentence_transformers import SentenceTransformer
+# --------------------------
+# Config
+# --------------------------
+GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
+if not GROQ_API_KEY:
+    raise RuntimeError("GROQ_API_KEY missing. Set it before running: os.environ['GROQ_API_KEY']='...'")
+# Change these if your filenames differ:
+DOC_PATHS = [
+    "docs/ASTM1.pdf",
+    "docs/ASTM2.pdf",
+]
+# Embedding model (free & small; good for Colab)
+EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+# --------------------------
+# Clients / Models
+# --------------------------
+client = Groq(api_key=GROQ_API_KEY)
+embedder = SentenceTransformer(EMBED_MODEL)
+# --------------------------
+# PDF text extraction
+# --------------------------
+def extract_text_from_pdf(file_path: str) -> str:
+    text = []
+    with fitz.open(file_path) as doc:
+        for page in doc:
+            text.append(page.get_text("text"))
+    return "\n".join(text)
+# --------------------------
+# Simple character-based chunking with overlap
+# --------------------------
+def chunk_text(text: str, chunk_size: int = 800, overlap: int = 120) -> List[str]:
+    chunks = []
+    start = 0
+    n = len(text)
+    while start < n:
+        end = min(start + chunk_size, n)
+        chunk = text[start:end].strip()
+        if chunk:
+            chunks.append(chunk)
+        start = end - overlap
+        if start < 0:
+            start = 0
+    return chunks
+# --------------------------
+# Build FAISS index
+# --------------------------
+def build_faiss_index(paths: List[str]):
+    texts = []
+    vectors = []
+    for p in paths:
+        if not os.path.exists(p):
+            raise FileNotFoundError(f"Document not found: {p}")
+        raw = extract_text_from_pdf(p)
+        chunks = chunk_text(raw)
+        if not chunks:
+            continue
+        embs = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
+        texts.extend(chunks)
+        vectors.append(embs.astype("float32"))
+    if not texts:
+        raise RuntimeError("No text extracted from provided PDFs.")
+    vectors = np.vstack(vectors).astype("float32")
+    index = faiss.IndexFlatL2(vectors.shape[1])
+    index.add(vectors)
+    # Persist (optional)
+    os.makedirs("faiss_index", exist_ok=True)
+    faiss.write_index(index, "faiss_index/index.faiss")
+    np.save("faiss_index/corpus.npy", np.array(texts, dtype=object))
+    return index, texts
+def load_or_build_index(paths: List[str]):
+    idx_path = "faiss_index/index.faiss"
+    corpus_path = "faiss_index/corpus.npy"
+    if os.path.exists(idx_path) and os.path.exists(corpus_path):
+        index = faiss.read_index(idx_path)
+        corpus = np.load(corpus_path, allow_pickle=True).tolist()
+        return index, corpus
+    return build_faiss_index(paths)
+# Build on import (so Gradio has it)
+INDEX, CORPUS = load_or_build_index(DOC_PATHS)
+# --------------------------
+# Retrieval
+# --------------------------
+def retrieve_context(query: str, top_k: int = 4) -> str:
+    q_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")
+    distances, indices = INDEX.search(q_emb, top_k)
+    selected = []
+    for i in indices[0]:
+        if 0 <= i < len(CORPUS):
+            selected.append(CORPUS[i])
+    return "\n\n---\n\n".join(selected)
+# --------------------------
+# LLM call via Groq
+# --------------------------
+SYSTEM_PROMPT = (
+    "You are a helpful Civil Engineering assistant. "
+    "Use ONLY the provided ASTM context to answer. "
+    "If the answer isn't in context, say you cannot find it in the provided documents."
+)
+def ask_groq(query: str, top_k: int = 4, model: str = "llama-3.3-70b-versatile") -> str:
+    context = retrieve_context(query, top_k=top_k)
+    prompt = f"""{SYSTEM_PROMPT}
+Context (ASTM excerpts):
+{context}
+Question:
+{query}
+Answer clearly and cite phrases only if present in the context above.
+"""
+    completion = client.chat.completions.create(
+        model=model,
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.2,
+    )
+    return completion.choices[0].message.content
+# --------------------------
+# Gradio UI
+# --------------------------
+def ui_ask(query: str, top_k: int):
+    try:
+        return ask_groq(query, top_k=top_k)
+    except Exception as e:
+        return f"Error: {e}"
+# Declarative UI: components are created in the order they appear inside the
+# Blocks context, so statement order here defines the page layout.
+with gr.Blocks(title="Civil Engineering RAG (ASTM)") as demo:
+    gr.Markdown("# 🏗️ Civil Engineering RAG (ASTM)\nAsk questions grounded in your uploaded ASTM PDFs.")
+    # Question box and Top-K slider share one row.
+    with gr.Row():
+        inp = gr.Textbox(label="Your question", placeholder="e.g., What is the acceptable slump range for Class A concrete?")
+        k = gr.Slider(1, 10, value=4, step=1, label="Top-K passages to retrieve")
+    out = gr.Textbox(label="Answer")
+    btn = gr.Button("Ask")
+    # Clicking "Ask" calls ui_ask(question, top_k) and shows its return value in `out`.
+    btn.click(ui_ask, inputs=[inp, k], outputs=[out])
+    # NOTE(review): load_or_build_index reuses existing faiss_index/ files, so a
+    # restart alone may not rebuild the index - confirm whether that directory
+    # must be deleted first.
+    gr.Markdown("Tip: If you change PDFs, **restart runtime** and re-run cells to rebuild the index.")
+# Launch only when run as a script, not when imported.
+if __name__ == "__main__":
+    # NOTE(review): share=True asks Gradio for a public share link - confirm it
+    # is wanted in the deployment environment.
+    demo.launch(share=True)