Source code for autorag.audio_source
from __future__ import annotations
import logging
import tempfile
import urllib.parse
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any
from autorag.errors import _missing_extra
if TYPE_CHECKING:
from collections.abc import Iterator
logger = logging.getLogger(__name__)
# Lowercase hostnames treated as YouTube. Lookups lowercase the parsed
# hostname before testing membership, so entries here must stay lowercase.
_YOUTUBE_HOSTS: frozenset[str] = frozenset(
    (
        "youtube.com",
        "www.youtube.com",
        "m.youtube.com",
        "music.youtube.com",
        "youtu.be",
    )
)
[docs]
@dataclass(frozen=True)
class AudioSource:
    """Immutable record describing a resolved audio input.

    Pairs the local file the pipeline reads with the identity of the original
    source and the descriptive metadata taken from yt-dlp's info dict, so
    downstream persistence can store human-readable values instead of falling
    back to the temp filename / mtime.
    """

    # Local file the rest of the pipeline reads.
    path: Path

    # Original-source identity; populated only when the input was a YouTube URL.
    source_url: str | None
    video_id: str | None

    # Optional metadata surfaced from yt-dlp's info dict.
    title: str | None = None
    upload_date: str | None = None
    duration_s: float | None = None
    uploader: str | None = None
[docs]
def is_youtube_url(value: str) -> bool:
    """Return True iff ``value`` parses as an http(s) URL on a YouTube host.

    Args:
        value: Candidate string; may be a URL or something else entirely,
            such as a local filesystem path.

    Returns:
        ``True`` when the scheme is ``http`` or ``https`` and the lowercased
        hostname is a member of ``_YOUTUBE_HOSTS``. ``False`` otherwise,
        including for strings :func:`urllib.parse.urlparse` refuses to parse.
    """
    try:
        parsed = urllib.parse.urlparse(value)
    except ValueError:
        # urlparse rejects a malformed authority (e.g. an unbalanced "[" as in
        # "http://[bad" or "//[x/clip.wav"). A string that does not parse is
        # simply not a YouTube URL, so answer False instead of propagating.
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    host = (parsed.hostname or "").lower()
    return host in _YOUTUBE_HOSTS
[docs]
def default_title_from(source: str) -> str:
    """Pick a fallback clip title for a local path or YouTube URL.

    Meant for the case where neither the caller nor yt-dlp supplied a title.

    * Anything that is not a YouTube URL is treated as a path and yields its
      file stem.
    * A YouTube URL yields the first ``v`` query value, or, when that is
      missing, the URL path without its leading slash. Surrounding slashes
      are removed, and ``"youtube-clip"`` is used if nothing is left.
    """
    if not is_youtube_url(source):
        return Path(source).stem

    parts = urllib.parse.urlparse(source)
    v_values = urllib.parse.parse_qs(parts.query).get("v", [""])
    candidate = v_values[0]
    if not candidate:
        # No ``v`` parameter: fall back to the path (e.g. ``youtu.be/<id>``).
        candidate = parts.path.lstrip("/")
    candidate = candidate.strip("/")
    if candidate:
        return candidate
    return "youtube-clip"
def _canonical_youtube_url(url: str) -> str:
    """Return a normalized ``https://www.youtube.com/watch?v=<id>`` URL.

    Collapses ``youtu.be/<id>``, ``m.youtube.com/watch?v=<id>``,
    ``youtube.com/shorts/<id>``, ``youtube.com/live/<id>``, etc. to a single
    canonical string so the same video hashes to the same id under
    :func:`uuid.uuid5`.

    Args:
        url: A YouTube video URL in any of the forms above.

    Returns:
        The canonical watch URL for the extracted video id.

    Raises:
        ValueError: If no video id can be extracted (unknown host, missing or
            blank id), or if ``url`` cannot be parsed at all.
    """
    parsed = urllib.parse.urlparse(url)
    host = (parsed.hostname or "").lower()
    video_id: str | None = None
    if host == "youtu.be":
        # Short links carry the id as the first path segment. Strip it the
        # same way the ``v`` parameter is stripped below so a blank segment
        # cannot leak into the canonical URL.
        video_id = parsed.path.lstrip("/").split("/", 1)[0].strip() or None
    elif host in _YOUTUBE_HOSTS:
        qs = urllib.parse.parse_qs(parsed.query)
        v = qs.get("v", [""])[0].strip()
        if v:
            video_id = v
        else:
            # Path-style video URLs: /shorts/<id> and /live/<id>. These pass
            # is_youtube_url(), so they must canonicalize rather than raise.
            segments = parsed.path.strip("/").split("/")
            if len(segments) >= 2 and segments[0] in ("shorts", "live"):
                video_id = segments[1].strip() or None
    if not video_id:
        raise ValueError(f"could not extract YouTube video id from URL: {url}")
    return f"https://www.youtube.com/watch?v={video_id}"
def _download_youtube_audio(url: str, dest_dir: Path) -> tuple[Path, dict[str, Any]]:
    """Download the audio track for ``url`` into ``dest_dir`` using yt-dlp.

    Args:
        url: Video URL, passed to yt-dlp unchanged.
        dest_dir: Directory that receives the file; the name follows the
            ``%(id)s.%(ext)s`` output template.

    Returns:
        ``(path, info)``: the downloaded file and the info dict yt-dlp
        returned for it.

    Raises:
        RuntimeError: If yt-dlp returns no info dict, the expected output
            file does not exist, or the info dict carries no video id.
        Exception: Whatever ``_missing_extra("youtube", ...)`` builds, when
            the optional ``yt_dlp`` package is not installed.
    """
    # Imported lazily so the module stays importable without the optional
    # dependency installed.
    try:
        import yt_dlp
    except ModuleNotFoundError as exc:
        raise _missing_extra("youtube", exc) from exc

    opts: dict[str, object] = {
        # Prefer a webm audio stream, otherwise whatever the best audio is.
        "format": "bestaudio[ext=webm]/bestaudio",
        "outtmpl": str(dest_dir / "%(id)s.%(ext)s"),
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
    }
    logger.info("Downloading YouTube audio: %s", url)
    with yt_dlp.YoutubeDL(opts) as ydl:
        info = ydl.extract_info(url, download=True)
        if info is None:
            # extract_info can come back empty; fail with a clear error here
            # instead of an opaque TypeError inside prepare_filename().
            raise RuntimeError(f"yt-dlp returned no info for: {url}")
        # Expand the output template for this info dict; the is_file() check
        # below confirms the download actually landed at that path.
        out = Path(ydl.prepare_filename(info))
    if not out.is_file():
        raise RuntimeError(f"yt-dlp did not produce expected file: {out}")
    if not str(info.get("id") or "").strip():
        raise RuntimeError(f"yt-dlp did not return a video id for: {url}")
    return out, info