MELD数据集源于EmotionLines数据集;后者是一个纯文本的对话数据集,对话取自经典电视剧《老友记》。论文链接:https://arxiv.org/abs/1810.02508
MELD数据集
获取MELD数据集
下载链接:http://web.eecs.umich.edu/~mihalcea/downloads/MELD.Raw.tar.gz
压缩包中包含13000多个MP4视频片段,按train、dev、test三部分划分,比例约为10:2:3。
解析MELD数据集
但是,从上述链接下载的MELD数据集中,train文件夹里的视频采用MPEG-4 AAC编码,FFmpeg无法正确打开。因此,我们对这些视频做了重新编码处理(视频转为H.264,音频转为AAC),并保存为MP4格式。
import os
import subprocess
# Source directory with the downloaded clips and destination directory for
# the re-encoded copies.
input_dir = 'train_splits/'
output_dir = 'train_splits_mp4/'

# Create the destination directory up front; this is a no-op if it exists.
os.makedirs(output_dir, exist_ok=True)

# Keep only the directory entries whose name ends with a known video suffix.
video_files = list(
    filter(
        lambda name: name.endswith(('.mp4', '.mov', '.avi', '.mkv')),
        os.listdir(input_dir),
    )
)
# Re-encode a single video file with ffmpeg
def reencode_video(input_path: str, output_path: str) -> bool:
    """Re-encode one video to H.264 video and AAC audio via the ffmpeg CLI.

    Failures are reported on stdout instead of being raised, so a batch
    caller can keep going with the remaining files.

    Args:
        input_path: Path of the source video.
        output_path: Path of the re-encoded file. An existing file at this
            path is overwritten (ffmpeg is called with ``-y``).

    Returns:
        True if ffmpeg ran and exited with status 0, False if it exited
        with a non-zero status or could not be started at all.
    """
    command = [
        'ffmpeg',
        '-i', input_path,
        '-c:v', 'libx264',
        '-c:a', 'aac',
        '-strict', 'experimental',
        '-b:a', '192k',
        '-y', output_path,
    ]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: ffmpeg ran but returned a non-zero exit status.
        # OSError: the ffmpeg executable could not be launched (e.g. it is
        # not installed or not on PATH).
        print(f"Error reencoding {input_path}: {e}")
        return False
    print(f"Successfully reencoded: {input_path} -> {output_path}")
    return True
# Re-encode every collected video file
for video_file in video_files:
    input_path = os.path.join(input_dir, video_file)
    # ffmpeg picks the container from the output extension, so always name
    # the result *.mp4 — otherwise .mov/.avi/.mkv inputs would keep their
    # original container instead of being converted to MP4.
    output_name = os.path.splitext(video_file)[0] + '.mp4'
    output_path = os.path.join(output_dir, output_name)
    reencode_video(input_path, output_path)
print("所有视频文件已重新编码!")
dia125_utt3.mp4文件缺失问题
https://github.com/declare-lab/MELD/issues/39
构建多模态数据集
下面均以train为例进行说明。下载的视频同时附带了与之一一对应的csv文件,可以据此构建多模态数据集。
下面是csv文件的双语解读。
import pandas as pd
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from moviepy.editor import VideoFileClip
# Directory that holds the input video files
# NOTE(review): the re-encoding step earlier in this file writes its results
# to 'train_splits_mp4/'; confirm whether this should point there instead.
video_dir = 'train_splits/'

# Read the annotation table from the CSV file into a DataFrame
csv_path = 'train_sent_emo.csv'
data = pd.read_csv(csv_path)
# Function to convert time format to seconds
def time_to_seconds(time_str: str) -> float:
    """Convert an ``hh:mm:ss,ms`` timestamp into a number of seconds.

    Args:
        time_str: Timestamp such as ``"00:16:16,059"``. The digits after the
            comma are read as a millisecond count. The comma part may be
            omitted (``"00:16:16"``), in which case it counts as zero.

    Returns:
        The total number of seconds as a float.

    Raises:
        ValueError: If the string does not consist of exactly three
            colon-separated fields, or a field is not an integer.
    """
    hours, minutes, seconds = time_str.split(':')
    # partition() tolerates a missing ",ms" suffix, whereas unpacking the
    # result of split(',') would raise on it.
    whole_seconds, _, millis = seconds.partition(',')
    millis = millis.strip()
    return (
        int(hours) * 3600
        + int(minutes) * 60
        + int(whole_seconds)
        + int(millis or 0) / 1000
    )
# Process each row in the CSV file: cut the video segment, extract its audio
# track and save the utterance text, all keyed by the row's 'Sr No.' value.
import os  # imported here so this snippet runs on its own

# Neither ffmpeg nor open() creates missing parent folders, so make sure the
# three output directories exist before writing anything.
for out_dir in ('output/visual', 'output/audio', 'output/text'):
    os.makedirs(out_dir, exist_ok=True)

# NOTE(review): according to the MELD README, StartTime/EndTime appear to be
# offsets within the source episode, while dia*_utt*.mp4 files are already
# per-utterance clips — confirm that cutting with these offsets is intended.
for _, row in data.iterrows():
    sr_no = row['Sr No.']
    dialogue_id = row['Dialogue_ID']
    utterance_id = row['Utterance_ID']
    # Define the visual file name
    video_filename = f"{video_dir}dia{dialogue_id}_utt{utterance_id}.mp4"
    output_video_path = f'output/visual/{sr_no}.mp4'
    output_audio_path = f'output/audio/{sr_no}.mp3'
    output_text_path = f'output/text/{sr_no}.txt'
    try:
        # Parsed inside the try so that one malformed timestamp is reported
        # for its row instead of aborting the whole run.
        start_time = time_to_seconds(row['StartTime'])
        end_time = time_to_seconds(row['EndTime'])
        # Extract the visual segment
        ffmpeg_extract_subclip(video_filename, start_time, end_time, targetname=output_video_path)
        # Extract the audio segment; the context manager closes the clip's
        # reader so that handles do not pile up over thousands of rows.
        with VideoFileClip(output_video_path) as video_clip:
            audio_clip = video_clip.audio
            if audio_clip is None:
                # The clip has no audio track; skip the row with a clear
                # message rather than failing on a None attribute access.
                print(f"处理 {sr_no} 时出错:视频不包含音轨")
                continue
            audio_clip.write_audiofile(output_audio_path)
        # Save the utterance text; UTF-8 is set explicitly so the result does
        # not depend on the platform's default encoding.
        with open(output_text_path, 'w', encoding='utf-8') as f:
            f.write(row['Utterance'])
    except OSError as e:
        print(f"处理 {sr_no} 时出错:{e}")
    except Exception as e:
        print(f"处理 {sr_no} 时出现意外错误:{e}")
print("处理完成!")