Continuous video transcription worker process.
Runs an infinite loop to process untranscribed videos by:
- Retrieving untranscribed videos from MongoDB
- Downloading the video from YouTube
- Converting to WAV
- Transcribing with the `stable-whisper` library and the large-v3 model
- Storing results
Source code in apps/annotator/code/transcribe.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def _print_exception_trace(exc):
    """Print an exception followed by one summary line per traceback frame.

    Each frame line has the form
    ``File: <basename>, Function: <name>, Line: <lineno> | <source text>``.

    Args:
        exc: The caught exception; its ``__traceback__`` is walked.
    """
    import os
    import traceback

    print(f"Exception: {exc}")
    for frame in traceback.extract_tb(exc.__traceback__):
        # FrameSummary.line already holds the stripped source text for the
        # frame, so the source file is not re-opened by hand here: doing that
        # inside an error handler can itself raise (frames without a readable
        # file, stale line numbers) and hide the original exception.
        print(
            f"File: {os.path.basename(frame.filename)}, Function: {frame.name}, "
            f"Line: {frame.lineno} | {frame.line}"
        )


def main():
    """
    Continuous video transcription worker process.

    Loads the `stable-whisper` large-v3 model once, then loops forever:

    1. Retrieving untranscribed videos from MongoDB
    2. Downloading the video from YouTube
    3. Converting to WAV
    4. Transcribing with `stable-whisper` library and large-v3 model
    5. Storing results

    Failure handling:

    - If fetching the job list fails, the error is printed, the worker sleeps
      300 seconds, rebuilds a MongoDB client and polls again.
    - If downloading or converting a video fails, the error is printed, the
      worker sleeps 300 seconds and moves on to the next video.
    - Any other exception is printed with a per-frame trace and ends the loop,
      so the function returns.

    Between polling rounds the worker sleeps 60 seconds.
    """
    # TODO stable-ts version 2.17.3: passing the language is not working, will be inferenced at cost of small increase in time
    #   self._model.transcribe(wav_path.__str__(), decode_options={"language":language}) \
    #              .save_as_json(json_path.__str__())
    model = stable_whisper.load_model(name='large-v3', in_memory=True, cpu_preload=True)
    print("Model loaded")

    # Imports are kept local to the function and after the model load, as in
    # the original layout.
    import os
    from json import load
    from pathlib import Path
    from time import sleep, time

    from database.mongo import get_untranscribed_videos, insert_video_data, get_video_data, remove_annotations_data
    from media.audio import convert_mp4_to_wav
    from media.segmentation import VideoAnalyzer

    base_folder = Path(__file__).parent.joinpath("static").joinpath("videos")

    try:
        while True:
            try:
                videos_metadata: list = get_untranscribed_videos()
                print(f"Jobs: {videos_metadata}")
            except Exception as e:
                _print_exception_trace(e)
                # If there is an error at network level sleep and try again reconnecting
                sleep(300)
                from env import MONGO_CLUSTER_USERNAME, MONGO_CLUSTER_PASSWORD
                import pymongo
                # NOTE(review): these rebind globals of THIS module; whether the
                # database.mongo helpers pick up the new client cannot be told
                # from here - confirm.
                global client
                global db
                client = pymongo.MongoClient(
                    "mongodb+srv://"+MONGO_CLUSTER_USERNAME+":"+MONGO_CLUSTER_PASSWORD+"@clusteredurell.z8aeh.mongodb.net/ekeel?retryWrites=true&w=majority")
                db = client.ekeel
                continue

            # `language` is currently unused: see the TODO above.
            for (video_id, language) in videos_metadata:
                print(f"New job: {video_id}")
                start_time = time()
                video_folder_path = base_folder.joinpath(video_id)
                try:
                    VideoAnalyzer("https://www.youtube.com/watch?v="+video_id, request_fields_from_db=["video_id"]).download_video()
                    convert_mp4_to_wav(video_folder_path, video_id)
                except Exception as e:
                    # Skip this video after a pause; it is not marked in any way here.
                    print(e)
                    sleep(300)
                    continue

                wav_path = video_folder_path.joinpath(video_id+".wav")
                json_path = video_folder_path.joinpath(video_id+".json")
                model.transcribe(str(wav_path)).save_as_json(str(json_path))
                with open(json_path) as f:
                    transcribed_data = load(f)["segments"]
                os.remove(wav_path)
                #os.remove(json_path) # Don't remove json for debug purposes

                video_data = get_video_data(video_id)
                video_data["transcript_data"] = {
                    "is_whisper_transcribed": True,
                    "is_autogenerated": True,
                    "text": transcribed_data
                }
                insert_video_data(video_data, update=False)
                remove_annotations_data(video_id)
                print(f"Done job: {video_id} in {round(time()-start_time,1)} seconds")

            # Pause between polling rounds.
            sleep(60)
    except Exception as e:
        # Anything not handled above stops the worker after reporting it.
        _print_exception_trace(e)
|