gholap310 committed on
Commit
5198010
·
verified ·
1 Parent(s): c13205b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -78
app.py CHANGED
@@ -1,79 +1,47 @@
1
- {\rtf1\ansi\ansicpg1252\cocoartf2822
2
- \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fmodern\fcharset0 Courier;}
3
- {\colortbl;\red255\green255\blue255;\red131\green0\blue165;\red245\green245\blue245;\red0\green0\blue0;
4
- \red15\green112\blue1;\red86\green65\blue25;\red144\green1\blue18;\red0\green0\blue255;\red0\green0\blue109;
5
- \red19\green85\blue52;}
6
- {\*\expandedcolortbl;;\cssrgb\c59216\c13725\c70588;\cssrgb\c96863\c96863\c96863;\cssrgb\c0\c0\c0;
7
- \cssrgb\c0\c50196\c0;\cssrgb\c41569\c32157\c12941;\cssrgb\c63922\c8235\c8235;\cssrgb\c0\c0\c100000;\cssrgb\c0\c6275\c50196;
8
- \cssrgb\c6667\c40000\c26667;}
9
- \margl1440\margr1440\vieww11520\viewh8400\viewkind0
10
- \deftab720
11
- \pard\pardeftab720\partightenfactor0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- \f0\fs28 \cf2 \cb3 \expnd0\expndtw0\kerning0
14
- \outl0\strokewidth0 \strokec2 import\cf0 \strokec4 json\cb1 \
15
- \cf2 \cb3 \strokec2 from\cf0 \strokec4 sklearn.feature_extraction.text \cf2 \strokec2 import\cf0 \strokec4 TfidfVectorizer\cb1 \
16
- \cf2 \cb3 \strokec2 from\cf0 \strokec4 sklearn.metrics.pairwise \cf2 \strokec2 import\cf0 \strokec4 cosine_similarity\cb1 \
17
- \cf2 \cb3 \strokec2 from\cf0 \strokec4 transformers \cf2 \strokec2 import\cf0 \strokec4 pipeline\cb1 \
18
- \cf2 \cb3 \strokec2 import\cf0 \strokec4 gradio \cf2 \strokec2 as\cf0 \strokec4 gr\cb1 \
19
- \
20
- \pard\pardeftab720\partightenfactor0
21
- \cf5 \cb3 \strokec5 # Load your natural-language corpus\cf0 \cb1 \strokec4 \
22
- \pard\pardeftab720\partightenfactor0
23
- \cf2 \cb3 \strokec2 with\cf0 \strokec4 \cf6 \strokec6 open\cf0 \strokec4 (\cf7 \strokec7 "electricity_corpus.json"\cf0 \strokec4 , \cf7 \strokec7 "r"\cf0 \strokec4 ) \cf2 \strokec2 as\cf0 \strokec4 f:\cb1 \
24
- \pard\pardeftab720\partightenfactor0
25
- \cf0 \cb3 corpus = json.load(f)\cb1 \
26
- \
27
- \pard\pardeftab720\partightenfactor0
28
- \cf5 \cb3 \strokec5 # Build TF-IDF index\cf0 \cb1 \strokec4 \
29
- \pard\pardeftab720\partightenfactor0
30
- \cf0 \cb3 vectorizer = TfidfVectorizer()\cb1 \
31
- \cb3 tfidf_matrix = vectorizer.fit_transform(corpus)\cb1 \
32
- \
33
- \pard\pardeftab720\partightenfactor0
34
- \cf5 \cb3 \strokec5 # Load the QA model\cf0 \cb1 \strokec4 \
35
- \pard\pardeftab720\partightenfactor0
36
- \cf0 \cb3 qa_pipeline = pipeline(\cf7 \strokec7 "question-answering"\cf0 \strokec4 , model=\cf7 \strokec7 "distilbert-base-cased-distilled-squad"\cf0 \strokec4 )\cb1 \
37
- \
38
- \pard\pardeftab720\partightenfactor0
39
- \cf5 \cb3 \strokec5 # Function to retrieve top matching rows\cf0 \cb1 \strokec4 \
40
- \pard\pardeftab720\partightenfactor0
41
- \cf8 \cb3 \strokec8 def\cf0 \strokec4 \cf6 \strokec6 get_top_contexts\cf0 \strokec4 (\cf9 \strokec9 question\cf0 \strokec4 , \cf9 \strokec9 top_k\cf0 \strokec4 =\cf10 \strokec10 3\cf0 \strokec4 ):\cb1 \
42
- \pard\pardeftab720\partightenfactor0
43
- \cf0 \cb3 question_vec = vectorizer.transform([question])\cb1 \
44
- \cb3 similarities = cosine_similarity(question_vec, tfidf_matrix).flatten()\cb1 \
45
- \cb3 top_indices = similarities.argsort()[-top_k:][::\cf10 \strokec10 -1\cf0 \strokec4 ]\cb1 \
46
- \cb3 \cf2 \strokec2 return\cf0 \strokec4 [corpus[i] \cf2 \strokec2 for\cf0 \strokec4 i \cf8 \strokec8 in\cf0 \strokec4 top_indices]\cb1 \
47
- \
48
- \pard\pardeftab720\partightenfactor0
49
- \cf5 \cb3 \strokec5 # Main logic to get answer\cf0 \cb1 \strokec4 \
50
- \pard\pardeftab720\partightenfactor0
51
- \cf8 \cb3 \strokec8 def\cf0 \strokec4 \cf6 \strokec6 answer_question\cf0 \strokec4 (\cf9 \strokec9 question\cf0 \strokec4 , \cf9 \strokec9 top_k\cf0 \strokec4 =\cf10 \strokec10 3\cf0 \strokec4 ):\cb1 \
52
- \pard\pardeftab720\partightenfactor0
53
- \cf0 \cb3 \cf2 \strokec2 if\cf0 \strokec4 \cf8 \strokec8 not\cf0 \strokec4 question.strip():\cb1 \
54
- \cb3 \cf2 \strokec2 return\cf0 \strokec4 \cf7 \strokec7 "Please enter a valid question."\cf0 \cb1 \strokec4 \
55
- \
56
- \cb3 contexts = get_top_contexts(question, top_k)\cb1 \
57
- \cb3 combined_context = \cf7 \strokec7 " "\cf0 \strokec4 .join(contexts)[:\cf10 \strokec10 4096\cf0 \strokec4 ] \cf5 \strokec5 # truncate to model max input\cf0 \cb1 \strokec4 \
58
- \cb3 result = qa_pipeline(question=question, context=combined_context)\cb1 \
59
- \cb3 \cf2 \strokec2 return\cf0 \strokec4 result[\cf7 \strokec7 "answer"\cf0 \strokec4 ]\cb1 \
60
- \
61
- \pard\pardeftab720\partightenfactor0
62
- \cf5 \cb3 \strokec5 # Gradio interface\cf0 \cb1 \strokec4 \
63
- \pard\pardeftab720\partightenfactor0
64
- \cf0 \cb3 iface = gr.Interface(\cb1 \
65
- \cb3 fn=answer_question,\cb1 \
66
- \cb3 inputs=gr.Textbox(label=\cf7 \strokec7 "Ask your question about electricity usage..."\cf0 \strokec4 ),\cb1 \
67
- \cb3 outputs=gr.Textbox(label=\cf7 \strokec7 "Answer"\cf0 \strokec4 ),\cb1 \
68
- \cb3 title=\cf7 \strokec7 "\uc0\u55357 \u56588 Electricity Data Q&A"\cf0 \strokec4 ,\cb1 \
69
- \cb3 description=\cf7 \strokec7 "Ask questions like 'What was the price for residential in Texas in Jan 2001?' or 'Which state had highest revenue in Jan 2001?'"\cf0 \strokec4 ,\cb1 \
70
- \cb3 )\cb1 \
71
- \
72
- \pard\pardeftab720\partightenfactor0
73
- \cf5 \cb3 \strokec5 # Run the app\cf0 \cb1 \strokec4 \
74
- \pard\pardeftab720\partightenfactor0
75
- \cf2 \cb3 \strokec2 if\cf0 \strokec4 \cf9 \strokec9 __name__\cf0 \strokec4 == \cf7 \strokec7 "__main__"\cf0 \strokec4 :\cb1 \
76
- \pard\pardeftab720\partightenfactor0
77
- \cf0 \cb3 iface.launch()\cb1 \
78
- \
79
- }
 
1
+ import json
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ from transformers import pipeline
5
+ import gradio as gr
6
+
7
+ # Load your natural-language corpus
8
+ with open("electricity_corpus.json", "r") as f:
9
+ corpus = json.load(f)
10
+
11
+ # Build TF-IDF index
12
+ vectorizer = TfidfVectorizer()
13
+ tfidf_matrix = vectorizer.fit_transform(corpus)
14
+
15
+ # Load the QA model
16
+ qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
17
+
18
+ # Function to retrieve top matching rows
19
+ def get_top_contexts(question, top_k=3):
20
+ question_vec = vectorizer.transform([question])
21
+ similarities = cosine_similarity(question_vec, tfidf_matrix).flatten()
22
+ top_indices = similarities.argsort()[-top_k:][::-1]
23
+ return [corpus[i] for i in top_indices]
24
+
25
+ # Main logic to get answer
26
+ def answer_question(question, top_k=3):
27
+ if not question.strip():
28
+ return "Please enter a valid question."
29
+
30
+ contexts = get_top_contexts(question, top_k)
31
+ combined_context = " ".join(contexts)[:4096] # truncate to model max input
32
+ result = qa_pipeline(question=question, context=combined_context)
33
+ return result["answer"]
34
+
35
+ # Gradio interface
36
+ iface = gr.Interface(
37
+ fn=answer_question,
38
+ inputs=gr.Textbox(label="Ask your question about electricity usage..."),
39
+ outputs=gr.Textbox(label="Answer"),
40
+ title="🔌 Electricity Data Q&A",
41
+ description="Ask questions like 'What was the price for residential in Texas in Jan 2001?' or 'Which state had highest revenue in Jan 2001?'",
42
+ )
43
+
44
+ # Run the app
45
+ if __name__ == "__main__":
46
+ iface.launch()
47