debaghtk committed on
Commit
ea4e663
·
1 Parent(s): 0e4fa7d

fix reading transcriptions

Browse files
Files changed (1) hide show
  1. app.py +47 -37
app.py CHANGED
@@ -29,45 +29,55 @@ voice_id = os.getenv("ELEVENLABS_VOICE_ID")
29
 
30
  # Load and process the transcripts
31
  documents = []
32
- folder_path = "transcriptions-local" if os.getenv("ENVIRONMENT") == "local" else "transcriptions"
33
-
34
- for filename in os.listdir(folder_path):
35
- if filename.endswith(".txt"):
36
- file_path = os.path.join(folder_path, filename)
37
- with open(file_path, "r", encoding='utf-8') as f:
38
- content = f.read()
39
-
40
- # Extract video ID from filename
41
- base_name = os.path.splitext(filename)[0]
42
- video_id = base_name.replace("_transcription", "")
43
- youtube_link_base = f"https://www.youtube.com/watch?v={video_id}"
44
-
45
- # Parse the transcript content to extract entries with timestamps
46
- lines = content.splitlines()
47
- for line in lines:
48
- # Updated regex to match your transcript format
49
- match = re.match(r"^\[(\d+\.\d+) - (\d+\.\d+)\]\s*(.*)$", line)
50
- if match:
51
- start_time = float(match.group(1))
52
- end_time = float(match.group(2))
53
- text = match.group(3)
54
- # Create a Document for each transcript entry
55
- entry = Document(
56
- page_content=text,
57
- metadata={
58
- "youtube_link": youtube_link_base,
59
- "start_time": start_time,
60
- "end_time": end_time,
61
- "timestamp_link": f"{youtube_link_base}&t={int(start_time)}"
62
- }
63
- )
64
- documents.append(entry)
65
- else:
66
- # Handle other lines if needed
67
- continue
 
 
 
 
 
 
 
 
 
 
68
 
69
  # Check documents length
70
- print(f"Number of documents loaded: {len(documents)}")
71
 
72
  # Continue with splitting documents if necessary
73
  text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
 
29
 
30
# Load and process the transcripts
documents = []  # filled below: one Document per matched transcript line
32
# Both branches of the old ENVIRONMENT conditional resolved to the same
# value ("transcriptions"), so the environment check was dead code —
# assign the folder directly.
folder_path = "transcriptions"
33
+
34
def process_transcript_file(file_path):
    """Parse a single transcript .txt file into timestamped Document entries.

    Lines are expected in the form ``[12.34 - 56.78] spoken text``; any
    line that does not match is skipped.  The YouTube video ID is derived
    from the filename (minus a ``_transcription`` suffix) and the channel
    name from the parent directory.

    Returns a list of Document objects, one per matched transcript line.
    """
    # Extract the video ID from the filename, e.g. "abc123_transcription.txt" -> "abc123".
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    video_id = base_name.replace("_transcription", "")
    youtube_link_base = f"https://www.youtube.com/watch?v={video_id}"
    # Channel name comes from the containing folder — presumably one folder
    # per channel; verify against how the transcriptions tree is laid out.
    channel = os.path.basename(os.path.dirname(file_path))

    # Compile the timestamp pattern once instead of re-matching the raw
    # pattern string for every transcript line.
    timestamp_re = re.compile(r"^\[(\d+\.\d+) - (\d+\.\d+)\]\s*(.*)$")

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    file_documents = []
    for line in content.splitlines():
        match = timestamp_re.match(line)
        if not match:
            continue  # non-transcript line (blank, header, etc.)
        start_time = float(match.group(1))
        end_time = float(match.group(2))
        text = match.group(3)
        file_documents.append(
            Document(
                page_content=text,
                metadata={
                    "youtube_link": youtube_link_base,
                    "start_time": start_time,
                    "end_time": end_time,
                    # Direct jump link to this entry's start time.
                    "timestamp_link": f"{youtube_link_base}&t={int(start_time)}",
                    "channel": channel,
                },
            )
        )
    return file_documents
66
+
67
# Walk the whole transcripts tree so nested channel folders are picked up too.
for root, _dirs, names in os.walk(folder_path):
    for name in names:
        # Only transcript text files are of interest.
        if not name.endswith(".txt"):
            continue
        file_path = os.path.join(root, name)
        try:
            file_documents = process_transcript_file(file_path)
            documents.extend(file_documents)
            print(f"Processed {file_path}: {len(file_documents)} entries")
        except Exception as e:
            # Best-effort: report the failure and keep going with the rest.
            print(f"Error processing {file_path}: {str(e)}")
78
 
79
# Check documents length
print(f"Total number of documents loaded: {len(documents)}")

# Continue with splitting documents if necessary
# NOTE(review): chunk_size/chunk_overlap are presumably measured in
# characters — confirm against the CharacterTextSplitter docs in use.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)