faster whisper从多媒体语音材料中抽取出文本-2

为脚本添加每个音频的时长统计,以及每个音频转换所用的耗时统计

安装依赖

pip install faster-whisper pydub
"""
批量转录当前目录下的 .mp3 文件,使用 faster-whisper
新增功能:
- 每个音频的时长(秒)
- 每个音频的转录耗时(秒)
- 总计统计:总音频时长、总转录耗时、平均实时倍率
"""
import os
import sys
import time
from pathlib import Path
from typing import List, Tuple

from faster_whisper import WhisperModel
from pydub import AudioSegment


# ================== Configuration ==================
MODEL_SIZE = "small"      # options: tiny, base, small, medium, large
DEVICE = "cpu"            # "cpu" or "cuda"
COMPUTE_TYPE = "int8"     # int8, float16, float32 (int8 recommended on CPU)
VAD_FILTER = True         # enable voice-activity detection to drop silence
OUTPUT_FORMAT = "txt"     # only .txt output; NOTE(review): not referenced by the code visible here
VERBOSE = True            # whether to print detailed (per-segment) log lines
# ===================================================


def get_audio_duration(audio_path: Path) -> float:
    """Return the length of an audio file in seconds, measured with pydub.

    Any failure while loading the file is reported on stderr and the
    function returns ``0.0`` instead of raising.
    """
    try:
        # len() of a pydub segment is its length in milliseconds.
        length_ms = len(AudioSegment.from_file(str(audio_path)))
    except Exception as e:
        print(f"无法获取 {audio_path.name} 时长: {e}", file=sys.stderr)
        return 0.0
    return length_ms / 1000.0


def transcribe_audio(
    audio_path: Path, model: WhisperModel
) -> Tuple[str, float, float]:
    """Transcribe a single audio file and time the work.

    Args:
        audio_path: Path of the audio file to transcribe.
        model: An already loaded faster-whisper model.

    Returns:
        A ``(text, duration_seconds, elapsed_seconds)`` tuple: the stripped
        segment texts joined with newlines, the audio length in seconds and
        the wall-clock time spent transcribing.
    """
    duration = get_audio_duration(audio_path)
    print(f"转录: {audio_path.name} ({duration:.2f}s) → {audio_path.stem}.txt")

    start_time = time.perf_counter()
    segments, info = model.transcribe(
        str(audio_path),
        language=None,           # auto-detect the spoken language
        beam_size=5,
        vad_filter=VAD_FILTER,
        vad_parameters=dict(min_silence_duration_ms=500),
        word_timestamps=False,
    )

    # ``segments`` is a lazy generator: the actual decoding happens while it
    # is iterated, so the timer has to keep running until the loop is done.
    text_lines = []
    for segment in segments:
        line = segment.text.strip()
        text_lines.append(line)
        if VERBOSE:
            print(f"[{segment.start:06.2f}s --> {segment.end:06.2f}s] {line}", flush=True)
    elapsed = time.perf_counter() - start_time

    if duration <= 0:
        # pydub could not measure the file; fall back to the duration that
        # faster-whisper reports for the decoded audio, when available.
        duration = float(getattr(info, "duration", 0.0) or 0.0)

    return "\n".join(text_lines), duration, elapsed


def format_time(seconds: float) -> str:
    """Format a non-negative number of seconds as ``h:mm:ss.ss``.

    The value is rounded to whole centiseconds *before* it is split into
    fields, so a value such as ``59.996`` carries into the minutes
    (``0:01:00.00``) instead of being rendered as ``0:00:60.00``.

    Args:
        seconds: Duration in seconds; expected to be finite and >= 0.

    Returns:
        The formatted duration, e.g. ``"1:01:01.50"``.
    """
    total_centiseconds = round(seconds * 100)
    hours, remainder = divmod(total_centiseconds, 360_000)
    minutes, centiseconds = divmod(remainder, 6_000)
    return f"{hours}:{minutes:02d}:{centiseconds / 100:05.2f}"


def main():
    """Transcribe every ``.mp3`` in the current directory and print statistics.

    For each file a sibling ``.txt`` is written; files whose ``.txt`` already
    exists are skipped (their duration still counts towards the total audio
    duration). A failure on one file is reported on stderr and the batch
    continues. At the end a summary and a per-file table are printed.
    """
    print("=== faster-whisper 批量转录(带时长与耗时统计)===")

    current_dir = Path(".")
    mp3_files = sorted(current_dir.glob("*.mp3"))

    if not mp3_files:
        print("未找到 .mp3 文件,退出。")
        return

    # Load the model once and reuse it for every file.
    print(f"正在加载模型 {MODEL_SIZE} ({DEVICE}, {COMPUTE_TYPE})...")
    model = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)

    processed = 0
    total_audio_duration = 0.0    # all files seen, including skipped ones
    transcribed_duration = 0.0    # only files transcribed in this run
    total_transcribe_time = 0.0
    results = []

    for mp3_path in mp3_files:
        txt_path = mp3_path.with_suffix(".txt")
        if txt_path.exists():
            duration = get_audio_duration(mp3_path)
            print(f"跳过: {txt_path.name} 已存在 ({duration:.2f}s)")
            total_audio_duration += duration
            continue

        try:
            text, duration, elapsed = transcribe_audio(mp3_path, model)
            txt_path.write_text(text, encoding="utf-8")

            total_audio_duration += duration
            transcribed_duration += duration
            total_transcribe_time += elapsed
            processed += 1

            rtf = elapsed / duration if duration > 0 else float('inf')
            print(f"完成: {mp3_path.name} | 时长 {duration:.2f}s | 耗时 {elapsed:.2f}s | RTF {rtf:.2f}x")
            results.append((mp3_path.name, duration, elapsed, rtf))

        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"错误转录 {mp3_path.name}: {e}", file=sys.stderr)

    # ================== Summary ==================
    print("\n" + "=" * 60)
    print("转录完成汇总")
    print("=" * 60)
    print(f"成功处理文件数   : {processed}")
    print(f"总音频时长       : {format_time(total_audio_duration)}")
    print(f"总转录耗时       : {format_time(total_transcribe_time)}")
    # Skipped files contribute no transcription time, so the average RTF is
    # computed against the duration of the files actually transcribed.
    if transcribed_duration > 0:
        avg_rtf = total_transcribe_time / transcribed_duration
        print(f"平均实时倍率(RTF): {avg_rtf:.2f}x")
    else:
        print(f"平均实时倍率(RTF): N/A")

    if results:
        print(f"\n明细列表:")
        print(f"{'文件名':<40} {'音频时长':>10} {'转录耗时':>10} {'RTF':>8}")
        print("-" * 70)
        for name, dur, ela, rtf in results:
            print(f"{name:<40} {dur:10.2f}s {ela:10.2f}s {rtf:8.2f}x")

    print("=" * 60)


# Run the batch transcription only when this file is executed as a script.
if __name__ == "__main__":
    main()

Similar Posts