MuhammadAhmad332 committed on
Commit
196febe
Β·
verified Β·
1 Parent(s): d141170

Upload 3 files

Browse files
Files changed (3) hide show
  1. .env +3 -0
  2. app.py +180 -0
  3. requirements.txt +6 -0
.env ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # SECURITY NOTE (review): live credentials are committed to version control here.
+ # Rotate these keys immediately and add .env to .gitignore so secrets never land in a repo.
2
+ GROQ_API_KEY=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
3
+ BRIGHT_API_KEY=ded1bc00-4d84-4dee-b4d6-c896cadf2417
4
+ BRIGHT_ZONE=web_unlocker2
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import pandas as pd
5
+ from groq import Groq
6
+ import gradio as gr
7
+ from dotenv import load_dotenv
8
+ from youtube_transcript_api import YouTubeTranscriptApi
9
+ import re
10
+
11
+ load_dotenv()
12
+
13
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
14
+ BRIGHT_API_KEY = os.getenv("BRIGHT_API_KEY")
15
+ BRIGHT_ZONE = os.getenv("BRIGHT_ZONE")
16
+
17
+ client = Groq(api_key=GROQ_API_KEY)
18
+
19
+
20
+ # BRIGHT DATA WEB UNLOCKER
21
+
22
+ def brightdata_request(target_url):
23
+ response = requests.post(
24
+ "https://api.brightdata.com/request",
25
+ headers={
26
+ "Content-Type": "application/json",
27
+ "Authorization": f"Bearer {BRIGHT_API_KEY}"
28
+ },
29
+ json={
30
+ "zone": BRIGHT_ZONE,
31
+ "url": target_url,
32
+ "format": "raw"
33
+ }
34
+ )
35
+
36
+ return response.text
37
+
38
+
39
+
40
+ # TAB 1 β€” GOODREADS SCRAPER
41
+
42
+ def scrape_goodreads(url):
43
+
44
+ html = brightdata_request(url)
45
+ print("HTML length:", len(html))
46
+ soup = BeautifulSoup(html, "html.parser")
47
+
48
+ books = []
49
+ rows = soup.find_all("tr")
50
+
51
+ for row in rows:
52
+ title_tag = row.find("a", class_="bookTitle")
53
+ author_tag = row.find("a", class_="authorName")
54
+ rating_tag = row.find("span", class_="minirating")
55
+
56
+ if title_tag and author_tag and rating_tag:
57
+ books.append({
58
+ "title": title_tag.get_text(strip=True),
59
+ "author": author_tag.get_text(strip=True),
60
+ "rating": rating_tag.get_text(strip=True)
61
+ })
62
+
63
+ print("Extracted books:", books[:3])
64
+ return books[:10]
65
+
66
+
67
+ def qa_bot(url, question):
68
+ books = scrape_goodreads(url)
69
+
70
+ if not books:
71
+ return "No book data found."
72
+
73
+ context = "\n".join(
74
+ [f"{i+1}. {b['title']} by {b['author']} - {b['rating']}"
75
+ for i, b in enumerate(books)]
76
+ )
77
+
78
+ system_prompt = f"""
79
+ You are a helpful assistant.
80
+ Here is book data scraped from Goodreads:
81
+
82
+ {context}
83
+
84
+ Answer questions ONLY using this data.
85
+ """
86
+
87
+ response = client.chat.completions.create(
88
+ model="llama-3.1-8b-instant",
89
+ messages=[
90
+ {"role": "system", "content": system_prompt},
91
+ {"role": "user", "content": question}
92
+ ]
93
+ )
94
+
95
+ return response.choices[0].message.content
96
+
97
+
98
+
99
+ # TAB 2 β€” YOUTUBE TRANSCRIPT
100
+
101
+
102
+ def extract_video_id(url):
103
+ pattern = r"(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})"
104
+ match = re.search(pattern, url)
105
+ return match.group(1) if match else None
106
+
107
+
108
+ def get_youtube_transcript(url):
109
+ video_id = extract_video_id(url)
110
+
111
+ if not video_id:
112
+ return "Invalid YouTube URL."
113
+
114
+ try:
115
+ api = YouTubeTranscriptApi()
116
+ transcript = api.fetch(video_id)
117
+ full_text = " ".join([entry.text for entry in transcript])
118
+ return full_text
119
+ except Exception as e:
120
+ print("Transcript ERROR:", str(e))
121
+ return "No transcript available for this video."
122
+
123
+
124
+ def youtube_qa(video_url, question):
125
+ transcript = get_youtube_transcript(video_url)
126
+
127
+ if transcript.startswith("No") or transcript.startswith("Invalid"):
128
+ return transcript
129
+
130
+ system_prompt = f"""
131
+ You are a helpful assistant.
132
+ Answer ONLY using the transcript below.
133
+
134
+ Transcript:
135
+ {transcript[:6000]}
136
+ """
137
+
138
+ response = client.chat.completions.create(
139
+ model="llama-3.1-8b-instant",
140
+ messages=[
141
+ {"role": "system", "content": system_prompt},
142
+ {"role": "user", "content": question}
143
+ ]
144
+ )
145
+
146
+ return response.choices[0].message.content
147
+
148
+
149
+
150
+ # GRADIO UI β€” VERSION 2
151
+
152
+
153
+ with gr.Blocks() as demo:
154
+ gr.Markdown("# Version 2 β€” Website + YouTube Q&A")
155
+
156
+ # TAB 1
157
+ with gr.Tab("🌐 Website Scraper Q&A"):
158
+ # url_input = gr.Textbox(label="Enter Goodreads URL")
159
+ url_input = gr.Textbox(
160
+ label="Enter Goodreads URL",
161
+ value="https://www.goodreads.com/list/show/1.Best_Books_Ever"
162
+ )
163
+ question_input = gr.Textbox(label="Ask your question")
164
+ output1 = gr.Textbox(label="Answer")
165
+ btn1 = gr.Button("Submit")
166
+
167
+ btn1.click(qa_bot, inputs=[url_input, question_input], outputs=output1)
168
+
169
+ # TAB 2
170
+ with gr.Tab("πŸŽ₯ YouTube Transcript Q&A"):
171
+ video_input = gr.Textbox(label="Enter YouTube URL")
172
+ yt_question = gr.Textbox(label="Ask your question")
173
+ output2 = gr.Textbox(label="Answer")
174
+ btn2 = gr.Button("Submit")
175
+
176
+ btn2.click(youtube_qa, inputs=[video_input, yt_question], outputs=output2)
177
+
178
+
179
+ if __name__ == "__main__":
180
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ groq
3
+ requests
4
+ beautifulsoup4
5
+ python-dotenv
6
+ youtube-transcript-api
7
+ pandas