paroles-net-scraper/paroles_net_scraper/paroles_net_scraper.py

#!/usr/bin/env python3
"""
Module to fetch song lyrics from paroles.net
"""

import argparse
import re

import requests
from bs4 import BeautifulSoup


def get_song_lyrics(artist, song_title):
    """
    Fetch song lyrics from paroles.net

    The page URL is built directly from the artist and title (lowercased,
    spaces turned into hyphens), so the lookup only succeeds when that guess
    matches the site's own URL for the song.

    Args:
        artist (str): Name of the artist
        song_title (str): Title of the song

    Returns:
        str: Song lyrics or error message. Failures are reported through the
            returned string; no exception is raised for network or parsing
            problems.
    """
    # Local import: Comment is only needed here to recognise HTML comments.
    from bs4 import Comment

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the caller forever.
    request_timeout = 10

    # Format the URL
    # Convert artist and song to lowercase and replace spaces with hyphens
    formatted_artist = (
        artist.lower().replace(" ", "-").replace("$", "s").replace("&", "and")
    )
    formatted_song = (
        song_title.lower().replace(" ", "-").replace("'", "").replace('"', "")
    )

    url = f"https://www.paroles.net/{formatted_artist}/paroles-{formatted_song}"

    try:
        # Set headers to mimic a browser request
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )
        }

        # Send GET request
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()  # Raise exception for bad status codes

        # Parse HTML content
        soup = BeautifulSoup(response.content, "html.parser")

        # Find the lyrics container
        # Looking for the div with class 'song-text'
        lyrics_div = soup.find("div", class_="song-text")

        if not lyrics_div:
            return "Lyrics not found on the page"

        # Extract text content
        # Get all text from the div but preserve line breaks
        lyrics_parts = []
        for element in lyrics_div.descendants:
            if not isinstance(element, str):
                # A tag. Only <br> contributes output (a line break); the text
                # of any other tag is collected from its text nodes below.
                # Reading the tag's own .string as well would emit that text
                # a second time.
                if element.name == "br":
                    lyrics_parts.append("\n")
                continue

            # A text node. Skip HTML comments and script/style bodies, which
            # are never part of the lyrics.
            if isinstance(element, Comment):
                continue
            if element.parent is not None and element.parent.name in (
                "script",
                "style",
            ):
                continue

            if element.strip() and "Paroles de la chanson" not in element:
                lyrics_parts.append(element)

        # Join the parts and clean up
        lyrics = "".join(lyrics_parts).strip()

        # Clean up extra whitespace while preserving verse structure
        cleaned_lines = []
        for line in lyrics.split("\n"):
            stripped_line = line.strip()
            # Skip empty lines and ad content
            if not stripped_line or re.match(
                r"^(Content_\d+|.*Advertisement.*|\d+\s*)$", stripped_line
            ):
                continue
            # Also remove inline ad markers
            cleaned_line = re.sub(r"^Content_\d+\s*", "", stripped_line)
            if cleaned_line:  # Only add non-empty lines
                cleaned_lines.append(cleaned_line)

        lyrics = "\n".join(cleaned_lines).strip()

        if not lyrics:
            return "Could not extract lyrics from the page"

        return lyrics

    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and bad HTTP status codes.
        return f"Error fetching lyrics: {str(e)}"
    except Exception as e:
        # Boundary handler: this function reports every failure as a string.
        return f"Error parsing lyrics: {str(e)}"


def search_song(artist, song_title):
    """
    Search for a song on paroles.net and return the first result

    The first link on the results page whose href contains "/paroles-" is
    taken as the match.

    Args:
        artist (str): Name of the artist
        song_title (str): Title of the song

    Returns:
        str: URL of the first search result or error message. Successful
            results always start with "http"; request failures are reported
            through the returned string rather than raised.
    """
    # Local import: only needed to resolve the result link against the site.
    from urllib.parse import urljoin

    base_url = "https://www.paroles.net"
    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the caller forever.
    request_timeout = 10

    # Format search URL
    search_query = f"{artist} {song_title}"
    search_url = f"{base_url}/recherche?q={requests.utils.quote(search_query)}"

    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )
        }

        response = requests.get(
            search_url, headers=headers, timeout=request_timeout
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        # Find the first search result link
        first_result = soup.find("a", href=lambda x: x and "/paroles-" in x)

        if first_result:
            # urljoin handles site-relative, protocol-relative and absolute
            # hrefs alike; plain concatenation would produce a malformed URL
            # whenever the href already carries a host.
            return urljoin(base_url, first_result["href"])
        return "No search results found"

    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and bad HTTP status codes.
        return f"Error searching for song: {str(e)}"


def _slugs_from_song_url(url):
    """Return the (artist, song) URL slugs of a paroles.net song URL, or None."""
    from urllib.parse import urlsplit

    segments = [part for part in urlsplit(url).path.split("/") if part]
    # Song pages have the path /<artist-slug>/paroles-<song-slug>; anything
    # else (e.g. a navigation link) cannot be mapped back to a song.
    if len(segments) != 2:
        return None
    artist_slug, page = segments
    prefix = "paroles-"
    if not page.startswith(prefix) or len(page) == len(prefix):
        return None
    return artist_slug, page[len(prefix):]


def main():
    """
    Command-line entry point: parse arguments, fetch lyrics and print them.

    Accepts either a single "ARTIST - SONG TITLE" argument or separate artist
    and song arguments. With --search, the site search is used to locate the
    song page before the lyrics are fetched.

    Raises:
        SystemExit: With status 2 (via argparse) when the artist or song
            title is missing or the single-argument form lacks " - ".
    """
    parser = argparse.ArgumentParser(description="Fetch song lyrics from paroles.net")
    parser.add_argument(
        "query",
        help=(
            "Artist and song in format 'ARTIST - SONG TITLE' or "
            "separate artist and song arguments"
        ),
    )
    parser.add_argument(
        "song", nargs="?", help="Song title (optional if using ARTIST - SONG format)"
    )
    parser.add_argument(
        "--search",
        action="store_true",
        help="Use search functionality instead of direct URL construction",
    )

    args = parser.parse_args()

    # Handle both input formats:
    # 1. Single argument: "ARTIST - SONG TITLE"
    # 2. Two arguments: ARTIST SONG_TITLE
    if args.song is None:
        # Single argument format: split on " - "
        if " - " not in args.query:
            # parser.error prints usage to stderr and exits with status 2, so
            # shell callers can detect the failure.
            parser.error(
                "Please provide artist and song in format 'ARTIST - SONG TITLE'"
            )
        artist, song = args.query.split(" - ", 1)
    else:
        # Two argument format: artist and song provided separately
        artist = args.query
        song = args.song

    # Strip any leading/trailing whitespace
    artist = artist.strip()
    song = song.strip()

    # An empty artist or title would only yield a meaningless URL.
    if not artist or not song:
        parser.error("Artist and song title must both be non-empty")

    if args.search:
        # First search for the song to get the correct URL
        search_result = search_song(artist, song)
        if not search_result.startswith("http"):
            print(search_result)  # Print error message
            return

        print(f"Found song at: {search_result}")
        # Fetch the page the search actually found: reuse its URL slugs so
        # get_song_lyrics rebuilds that same URL. Fall back to the user's
        # input when the URL does not have the expected shape.
        slugs = _slugs_from_song_url(search_result)
        if slugs is not None:
            artist, song = slugs

    print(get_song_lyrics(artist, song))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()