Update app.py
app.py CHANGED
@@ -1,6 +1,5 @@
-
 # ============================================
-# Civil Engineering RAG (ASTM) -
+# Civil Engineering RAG (ASTM) - Uploadable Version
 # ============================================
 import os
 import fitz  # PyMuPDF
@@ -10,28 +9,19 @@ import gradio as gr
 from typing import List
 from groq import Groq
 from sentence_transformers import SentenceTransformer
+import tempfile

 # --------------------------
 # Config
 # --------------------------
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
 if not GROQ_API_KEY:
-    raise RuntimeError("GROQ_API_KEY
-
-# Change these if your filenames differ:
-DOC_PATHS = [
-    "docs/ASTM1.pdf",
-    "docs/ASTM2.pdf",
-]
+    raise RuntimeError("❌ Missing GROQ_API_KEY. Add it in Hugging Face → Settings → Secrets.")

-# Embedding model (free & small; good for Colab)
-EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-
-# --------------------------
-# Clients / Models
-# --------------------------
 client = Groq(api_key=GROQ_API_KEY)
-embedder = SentenceTransformer(
+embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
+INDEX, CORPUS = None, []

 # --------------------------
 # PDF text extraction
@@ -44,12 +34,10 @@ def extract_text_from_pdf(file_path: str) -> str:
     return "\n".join(text)

 # --------------------------
-#
+# Chunking helper
 # --------------------------
 def chunk_text(text: str, chunk_size: int = 800, overlap: int = 120) -> List[str]:
-    chunks = []
-    start = 0
-    n = len(text)
+    chunks, start, n = [], 0, len(text)
     while start < n:
         end = min(start + chunk_size, n)
         chunk = text[start:end].strip()
@@ -61,81 +49,61 @@ def chunk_text(text: str, chunk_size: int = 800, overlap: int = 120) -> List[str
     return chunks

 # --------------------------
-# Build FAISS index
+# Build FAISS index from uploaded PDFs
 # --------------------------
 def build_faiss_index(paths: List[str]):
-    texts = []
-    vectors = []
-
+    texts, vectors = [], []
     for p in paths:
-        if not os.path.exists(p):
-            raise FileNotFoundError(f"Document not found: {p}")
         raw = extract_text_from_pdf(p)
         chunks = chunk_text(raw)
-
-            continue
-        embs = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
+        embs = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=False)
         texts.extend(chunks)
         vectors.append(embs.astype("float32"))
-
-    if not texts:
-        raise RuntimeError("No text extracted from provided PDFs.")
-
     vectors = np.vstack(vectors).astype("float32")
     index = faiss.IndexFlatL2(vectors.shape[1])
     index.add(vectors)
-
-    # Persist (optional)
-    os.makedirs("faiss_index", exist_ok=True)
-    faiss.write_index(index, "faiss_index/index.faiss")
-    np.save("faiss_index/corpus.npy", np.array(texts, dtype=object))
-
     return index, texts

-def
-
-
-
-
-
-
-
-
-
-
+def rebuild_index_from_upload(files):
+    if not files:
+        return "⚠️ Please upload at least one PDF."
+    paths = []
+    for f in files:
+        temp_path = os.path.join(tempfile.gettempdir(), f.name)
+        f.save(temp_path)
+        paths.append(temp_path)
+    global INDEX, CORPUS
+    INDEX, CORPUS = build_faiss_index(paths)
+    return f"✅ Indexed {len(paths)} file(s). You can now ask questions!"

 # --------------------------
-# Retrieval
+# Retrieval + Groq LLM
 # --------------------------
 def retrieve_context(query: str, top_k: int = 4) -> str:
+    if INDEX is None:
+        return "⚠️ Please upload PDFs first."
     q_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")
     distances, indices = INDEX.search(q_emb, top_k)
-    selected = []
-    for i in indices[0]:
-        if 0 <= i < len(CORPUS):
-            selected.append(CORPUS[i])
+    selected = [CORPUS[i] for i in indices[0] if 0 <= i < len(CORPUS)]
     return "\n\n---\n\n".join(selected)

-# --------------------------
-# LLM call via Groq
-# --------------------------
 SYSTEM_PROMPT = (
     "You are a helpful Civil Engineering assistant. "
-    "Use ONLY the provided ASTM context to answer. "
-    "If the answer isn't in context, say you cannot find it
+    "Use ONLY the provided ASTM or uploaded document context to answer. "
+    "If the answer isn't in context, say you cannot find it."
 )

 def ask_groq(query: str, top_k: int = 4, model: str = "llama-3.3-70b-versatile") -> str:
-
+    if INDEX is None:
+        return "⚠️ Please upload PDFs first."
+    context = retrieve_context(query, top_k)
     prompt = f"""{SYSTEM_PROMPT}

-Context
+Context:
 {context}

 Question:
 {query}
-
-Answer clearly and cite phrases only if present in the context above.
 """
     completion = client.chat.completions.create(
         model=model,
@@ -151,17 +119,21 @@ def ui_ask(query: str, top_k: int):
     try:
         return ask_groq(query, top_k=top_k)
     except Exception as e:
-        return f"Error: {e}"
+        return f"❌ Error: {e}"

 with gr.Blocks(title="Civil Engineering RAG (ASTM)") as demo:
-    gr.Markdown("
+    gr.Markdown("## 🏗️ Civil Engineering RAG\nUpload ASTM or civil-engineering PDFs, build an index, and ask questions.")
     with gr.Row():
-
-
+        uploader = gr.File(label="Upload PDFs", file_count="multiple", file_types=[".pdf"])
+        status = gr.Textbox(label="Status", interactive=False)
+    uploader.upload(rebuild_index_from_upload, uploader, status)
+
+    gr.Markdown("---")
+    inp = gr.Textbox(label="Your Question", placeholder="e.g., What is the standard curing time for concrete?")
+    k = gr.Slider(1, 10, value=4, step=1, label="Top-K passages")
     out = gr.Textbox(label="Answer")
     btn = gr.Button("Ask")
     btn.click(ui_ask, inputs=[inp, k], outputs=[out])
-    gr.Markdown("Tip: If you change PDFs, **restart runtime** and re-run cells to rebuild the index.")

 if __name__ == "__main__":
-    demo.launch(
+    demo.launch()
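Note on the new upload handler: rebuild_index_from_upload writes each upload to os.path.join(tempfile.gettempdir(), f.name) and then calls f.save(temp_path). Whether the objects a gr.File upload event hands to its callback expose a .save() method, and whether .name is a bare filename rather than a full temp path, depends on the installed Gradio version, so this is worth verifying on the target Space. A more defensive variant, sketched here as an assumption rather than taken from this commit, accepts either plain paths or file-like objects:

import os
import shutil
import tempfile

def rebuild_index_from_upload(files):
    # Sketch: tolerate both plain path strings and objects exposing .name,
    # the two shapes different Gradio versions pass to upload callbacks.
    if not files:
        return "⚠️ Please upload at least one PDF."
    paths = []
    for f in files:
        src = f if isinstance(f, str) else getattr(f, "name", None)
        if not src:
            continue
        dst = os.path.join(tempfile.gettempdir(), os.path.basename(src))
        if os.path.abspath(src) != os.path.abspath(dst):
            shutil.copy(src, dst)  # keep a stable copy for indexing
        paths.append(dst)
    if not paths:
        return "⚠️ Could not read the uploaded files."
    global INDEX, CORPUS
    INDEX, CORPUS = build_faiss_index(paths)
    return f"✅ Indexed {len(paths)} file(s). You can now ask questions!"

The global INDEX, CORPUS assignment mirrors the committed handler, so retrieve_context and ask_groq keep working unchanged.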
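Since the rewrite drops the faiss_index/ persistence and keeps the index only in memory, a quick way to exercise the pipeline without the UI is to drive the same functions from a separate script. A hypothetical smoke test, assuming the module is importable as app, that sample.pdf is a placeholder path, and that GROQ_API_KEY is already set in the environment:

# smoke_test.py (hypothetical helper, not part of this commit)
import app  # building the Blocks UI on import is fine; launch() only runs under __main__

# Populate the module-level index the same way the upload callback does.
app.INDEX, app.CORPUS = app.build_faiss_index(["sample.pdf"])  # placeholder PDF path
print(app.retrieve_context("standard curing time for concrete", top_k=2))
print(app.ask_groq("What curing time does the standard specify?", top_k=2))

Running either sketch assumes the dependencies implied by the imports (gradio, groq, sentence-transformers, faiss-cpu, pymupdf, numpy) are installed.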