StevenMSAI committed on
Commit
ab25a0d
·
verified ·
1 Parent(s): 092e8f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -27
app.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
4
  import faiss
5
  import numpy as np
6
  from pypdf import PdfReader
 
7
  from sentence_transformers import SentenceTransformer
8
  from transformers import pipeline
9
 
@@ -18,14 +19,69 @@ embedder = SentenceTransformer(EMBED_MODEL_NAME)
18
  generator = pipeline("text2text-generation", model=GEN_MODEL_NAME)
19
 
20
  # ---- PDF to text ----
21
- def pdfs_to_texts(files):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  texts = []
23
- for f in files:
24
- # f is an object from Gradio that read bytes for pypdf
25
- reader = PdfReader(io.BytesIO(f.read()))
26
- pages = [page.extract_text() or "" for page in reader.pages]
27
- texts.append("\n".join(pages))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  return texts
 
29
 
30
 
31
  # ---- Chunking ----
@@ -46,30 +102,32 @@ corpus_chunks = []
46
 
47
  def build_index(files, progress=gr.Progress()):
48
  global index, corpus_chunks
49
- texts = pdfs_to_texts(files)
50
-
51
- # basic cleanup + chunk
52
- corpus_chunks = []
53
- for t in texts:
54
- if not t.strip():
55
- continue
56
- corpus_chunks += chunk_text(t)
 
 
 
 
 
57
 
58
- if not corpus_chunks:
59
- return "No text extracted from PDFs.", None
 
60
 
61
- progress(0.3, desc="Embedding chunks…")
62
- embeddings = embedder.encode(corpus_chunks, convert_to_numpy=True, show_progress_bar=False)
63
- d = embeddings.shape[1]
64
 
65
- progress(0.6, desc="Creating FAISS index…")
66
- index = faiss.IndexFlatIP(d) # cosine via inner product on normalized vectors
67
- # normalize to unit length to approximate cosine similarity
68
- norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-10
69
- embeddings = embeddings / norms
70
- index.add(embeddings.astype(np.float32))
71
 
72
- return f"Indexed {len(corpus_chunks)} chunks.", len(corpus_chunks)
73
 
74
  # ---- RAG query -> retrieve -> generate ----
75
  def answer_question(question, top_k=5, max_new_tokens=256):
@@ -99,7 +157,7 @@ with gr.Blocks(title="Group 5 Study Helper (RAG)") as demo:
99
  gr.Markdown("# Group 5 Study Helper (RAG)\nUpload PDFs → Build Index → Ask questions.")
100
 
101
  with gr.Row():
102
- file_in = gr.Files(file_types=[".pdf"], label="Upload PDF files")
103
  with gr.Row():
104
  build_btn = gr.Button("Build Index", variant="primary")
105
  status = gr.Markdown()
 
4
  import faiss
5
  import numpy as np
6
  from pypdf import PdfReader
7
+ from docx import Document
8
  from sentence_transformers import SentenceTransformer
9
  from transformers import pipeline
10
 
 
19
  generator = pipeline("text2text-generation", model=GEN_MODEL_NAME)
20
 
21
  # ---- PDF to text ----
22
def read_pdf_from_path_or_bytes(file_obj_or_path):
    """Extract text from a PDF given a filesystem path or a file-like upload.

    Resolution order: an object exposing a ``.path`` attribute, a plain
    string path that exists on disk, then raw bytes obtained from a
    ``.read()`` method or a ``.bytes`` attribute. Returns the page texts
    joined with newlines, or "" when nothing readable is found.
    """
    candidate = getattr(file_obj_or_path, "path", None)
    if isinstance(file_obj_or_path, str) and os.path.exists(file_obj_or_path):
        candidate = file_obj_or_path

    def _pages_text(reader):
        # extract_text() may return None for image-only pages; coerce to "".
        return "\n".join((page.extract_text() or "") for page in reader.pages)

    if candidate and os.path.exists(candidate):
        return _pages_text(PdfReader(candidate))

    raw = None
    if hasattr(file_obj_or_path, "read"):
        raw = file_obj_or_path.read()
    elif hasattr(file_obj_or_path, "bytes"):
        raw = file_obj_or_path.bytes
    if raw:
        return _pages_text(PdfReader(io.BytesIO(raw)))

    return ""
41
+
42
+
43
def read_docx_text(path):
    """Return the full text of a .docx file, one line per paragraph."""
    paragraphs = Document(path).paragraphs
    return "\n".join(para.text for para in paragraphs)
46
+
47
+
48
def load_files_to_texts(files):
    """Extract raw text from mixed uploads (.pdf, .docx, .txt).

    Accepts Gradio upload objects (exposing ``.path``/``.name`` and
    possibly ``.read()``/``.bytes``) or plain filesystem path strings —
    newer Gradio versions pass plain strings, which the previous version
    silently mishandled. Files with any other extension are skipped.

    Returns:
        list[str]: one raw-text string per recognized file (may be "").
    """
    import tempfile  # only needed for in-memory .docx uploads

    texts = []
    for f in files or []:
        # Resolve a filesystem path when one is available.
        path = getattr(f, "path", None) or getattr(f, "name", None)
        if path is None and isinstance(f, str):
            path = f
        name = (path or str(f)).lower()

        if name.endswith(".pdf"):
            texts.append(read_pdf_from_path_or_bytes(f if path is None else path))

        elif name.endswith(".docx"):
            if path:
                texts.append(read_docx_text(path))
            else:
                # python-docx needs a real path: spill the bytes to a temp
                # file, and always remove it even if parsing fails.
                data = f.read() if hasattr(f, "read") else getattr(f, "bytes", b"")
                with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tf:
                    tf.write(data)
                    tmp_path = tf.name
                try:
                    texts.append(read_docx_text(tmp_path))
                finally:
                    os.unlink(tmp_path)

        elif name.endswith(".txt"):
            if path and os.path.exists(path):
                with open(path, "r", encoding="utf-8", errors="ignore") as fh:
                    texts.append(fh.read())
            else:
                data = f.read().decode("utf-8", errors="ignore") if hasattr(f, "read") else ""
                texts.append(data)
    return texts
84
+
85
 
86
 
87
  # ---- Chunking ----
 
102
 
103
def build_index(files, progress=gr.Progress()):
    """Rebuild the global FAISS index from the uploaded files.

    Side effects: reassigns the module-level ``index`` and
    ``corpus_chunks``. Returns a ``(status_message, chunk_count)`` pair;
    failures are reported in the message instead of raising, so the UI
    always gets something to display.
    """
    global index, corpus_chunks
    try:
        corpus_chunks = []
        for text in load_files_to_texts(files):
            if text and text.strip():
                corpus_chunks.extend(chunk_text(text))

        if not corpus_chunks:
            return "No text extracted from files.", 0

        progress(0.3, desc="Embedding chunks…")
        vectors = embedder.encode(corpus_chunks, convert_to_numpy=True, show_progress_bar=False)
        dim = vectors.shape[1]

        # Unit-normalize so inner product behaves as cosine similarity.
        vectors = vectors / (np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-10)

        progress(0.6, desc="Creating FAISS index…")
        index = faiss.IndexFlatIP(dim)
        index.add(vectors.astype(np.float32))

        return f"Indexed {len(corpus_chunks)} chunks.", len(corpus_chunks)
    except Exception as exc:
        return f"Build failed: {exc}", 0
 
 
 
130
 
 
131
 
132
  # ---- RAG query -> retrieve -> generate ----
133
  def answer_question(question, top_k=5, max_new_tokens=256):
 
157
  gr.Markdown("# Group 5 Study Helper (RAG)\nUpload PDFs → Build Index → Ask questions.")
158
 
159
  with gr.Row():
160
+ file_in = gr.Files(file_types=[".pdf", ".docx", ".txt"], label="Upload PDF/DOCX/TXT files")
161
  with gr.Row():
162
  build_btn = gr.Button("Build Index", variant="primary")
163
  status = gr.Markdown()