AjayArmugam committed on
Commit
18ba5e5
·
verified ·
1 Parent(s): 4cd4aac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -43
app.py CHANGED
@@ -16,74 +16,119 @@ db = None
16
  def process_pdf(file):
17
  global db
18
 
19
- doc = fitz.open(file.name)
20
- text = ""
 
21
 
22
- for i, page in enumerate(doc[:50]):
23
- page_text = page.get_text()
24
 
25
- if len(page_text.strip()) < 50:
26
- pix = page.get_pixmap()
27
- img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
28
- pix.height, pix.width, pix.n
29
- )
30
- result = reader.readtext(img)
31
- page_text = " ".join([r[1] for r in result])
32
 
33
- text += page_text + "\n"
 
34
 
35
- splitter = RecursiveCharacterTextSplitter(
36
- chunk_size=500,
37
- chunk_overlap=100
38
- )
39
- chunks = splitter.split_text(text)
40
 
41
- embeddings = HuggingFaceEmbeddings()
42
- db = FAISS.from_texts(chunks, embeddings)
43
 
44
- return "✅ PDF processed! Ask your question now."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
 
47
  # ===== ANSWER FUNCTION =====
48
  def get_answer(query):
49
  global db
50
 
51
- if db is None:
52
- return "⚠️ Upload and process a PDF first.", ""
 
 
 
53
 
54
- docs = db.similarity_search(query, k=3)
 
 
55
 
56
- best_sentence = ""
57
- best_score = 0
58
- source = ""
59
 
60
- for doc in docs:
61
- sentences = doc.page_content.split(".")
62
 
63
- for sent in sentences:
64
- sent_clean = sent.strip()
65
 
66
- if len(sent_clean) < 20:
67
- continue
68
 
69
- score = fuzz.partial_ratio(query.lower(), sent_clean.lower())
 
 
70
 
71
- if "is" in sent_clean.lower() or "mode" in sent_clean.lower():
72
- score += 10
 
 
73
 
74
- if score > best_score:
75
- best_score = score
76
- best_sentence = sent_clean
77
- source = doc.page_content[:200]
78
 
79
- return best_sentence, source
 
 
 
 
 
 
 
80
 
81
 
82
  # ===== CHAT =====
83
  def chat(user_input, history):
84
- answer, source = get_answer(user_input)
85
- history.append((user_input, answer + "\n\n📌 Source: " + source))
86
- return "", history
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
 
89
  # ===== FEEDBACK =====
 
16
  def process_pdf(file):
17
  global db
18
 
19
+ try:
20
+ if file is None:
21
+ return "⚠️ Please upload a PDF first."
22
 
23
+ # 🔥 FIX: Works in HuggingFace
24
+ doc = fitz.open(stream=file.read(), filetype="pdf")
25
 
26
+ text = ""
 
 
 
 
 
 
27
 
28
+ for i, page in enumerate(doc[:50]):
29
+ page_text = page.get_text()
30
 
31
+ if len(page_text.strip()) < 50:
32
+ pix = page.get_pixmap()
33
+ img = np.frombuffer(
34
+ pix.samples, dtype=np.uint8
35
+ ).reshape(pix.height, pix.width, pix.n)
36
 
37
+ result = reader.readtext(img)
38
+ page_text = " ".join([r[1] for r in result])
39
 
40
+ text += page_text + "\n"
41
+
42
+ if not text.strip():
43
+ return "⚠️ No text found in PDF."
44
+
45
+ # chunking
46
+ splitter = RecursiveCharacterTextSplitter(
47
+ chunk_size=500,
48
+ chunk_overlap=100
49
+ )
50
+ chunks = splitter.split_text(text)
51
+
52
+ if len(chunks) == 0:
53
+ return "⚠️ Failed to create chunks."
54
+
55
+ # embeddings
56
+ embeddings = HuggingFaceEmbeddings()
57
+ db = FAISS.from_texts(chunks, embeddings)
58
+
59
+ return "✅ PDF processed! Ask your question now."
60
+
61
+ except Exception as e:
62
+ print("PROCESS ERROR:", e)
63
+ return "❌ Error processing PDF."
64
 
65
 
66
  # ===== ANSWER FUNCTION =====
67
  def get_answer(query):
68
  global db
69
 
70
+ try:
71
+ if db is None:
72
+ return "⚠️ Upload and process a PDF first.", ""
73
+
74
+ docs = db.similarity_search(query, k=3)
75
 
76
+ best_sentence = ""
77
+ best_score = 0
78
+ source = ""
79
 
80
+ for doc in docs:
81
+ sentences = doc.page_content.split(".")
 
82
 
83
+ for sent in sentences:
84
+ sent_clean = sent.strip()
85
 
86
+ if len(sent_clean) < 20:
87
+ continue
88
 
89
+ score = fuzz.partial_ratio(query.lower(), sent_clean.lower())
 
90
 
91
+ # boost definition-like lines
92
+ if "is" in sent_clean.lower() or "mode" in sent_clean.lower():
93
+ score += 10
94
 
95
+ if score > best_score:
96
+ best_score = score
97
+ best_sentence = sent_clean
98
+ source = doc.page_content[:200]
99
 
100
+ # 🔥 FIX: never return empty
101
+ if not best_sentence:
102
+ best_sentence = "❌ No relevant answer found."
 
103
 
104
+ if not source:
105
+ source = "No source available."
106
+
107
+ return best_sentence, source
108
+
109
+ except Exception as e:
110
+ print("ANSWER ERROR:", e)
111
+ return "⚠️ Error while generating answer.", ""
112
 
113
 
114
  # ===== CHAT =====
115
  def chat(user_input, history):
116
+ try:
117
+ if not user_input.strip():
118
+ return "", history
119
+
120
+ answer, source = get_answer(user_input)
121
+
122
+ history.append(
123
+ (user_input, answer + "\n\n📌 Source: " + source)
124
+ )
125
+
126
+ return "", history
127
+
128
+ except Exception as e:
129
+ print("CHAT ERROR:", e)
130
+ history.append((user_input, "⚠️ Something went wrong."))
131
+ return "", history
132
 
133
 
134
  # ===== FEEDBACK =====