Files
paroles-net-scraper/paroles_net_scraper/paroles_net_scraper.py
Rene Luria 43a8203f70 feat: Support artist - song format in command line arguments
Updated the CLI to accept arguments in two formats:
1. Single argument with dash separator: "ARTIST - SONG"
2. Two separate arguments: ARTIST SONG

This makes it more convenient for users to input artist and song information.
Also updated README to document both usage formats.
2025-08-11 14:28:04 +02:00

170 lines
6.0 KiB
Python

#!/usr/bin/env python3
"""
Module to fetch song lyrics from paroles.net
"""
import requests
from bs4 import BeautifulSoup
import argparse
import re
def get_song_lyrics(artist, song_title, timeout=10):
    """
    Fetch song lyrics from paroles.net

    The page URL is built directly from the artist and the title, the page is
    downloaded, and the text of the ``div`` with class ``song-text`` is
    extracted with ``<br>`` tags turned into line breaks.

    Args:
        artist (str): Name of the artist
        song_title (str): Title of the song
        timeout (float): Seconds to wait for the server before giving up

    Returns:
        str: Song lyrics or error message
    """
    # Local import: only needed to tell text nodes apart from tags below.
    from bs4 import NavigableString

    # Build the URL slugs: lowercase, spaces become hyphens, and a few
    # characters are substituted (artist) or dropped (song).
    formatted_artist = artist.lower().replace(' ', '-').replace('$', 's').replace('&', 'and')
    formatted_song = song_title.lower().replace(' ', '-').replace('\'', '').replace('"', '')
    url = f"https://www.paroles.net/{formatted_artist}/paroles-{formatted_song}"

    try:
        # Set headers to mimic a browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')

        # The lyrics live in the div with class 'song-text'
        lyrics_div = soup.find('div', class_='song-text')
        if not lyrics_div:
            return "Lyrics not found on the page"

        # Walk the whole subtree, keeping text and turning <br> into newlines.
        lyrics_parts = []
        for element in lyrics_div.descendants:
            if isinstance(element, NavigableString):
                # Only text nodes contribute text. A tag's .string mirrors
                # its single text child, so reading it from tags as well
                # would emit the same words twice.
                text = str(element)
                # Skip the heading that repeats the song info
                if text.strip() and 'Paroles de la chanson' not in text:
                    lyrics_parts.append(text)
            elif element.name == 'br':
                lyrics_parts.append('\n')

        lyrics = ''.join(lyrics_parts).strip()

        # Drop blank lines and ad residue, line by line.
        cleaned_lines = []
        for line in lyrics.split('\n'):
            stripped_line = line.strip()
            # Skip empty lines and lines that are only an ad marker,
            # an advertisement notice, or a bare number.
            if not stripped_line:
                continue
            if re.match(r'^(Content_\d+|.*Advertisement.*|\d+\s*)$', stripped_line):
                continue
            # Also remove an ad marker glued to the start of a real line
            cleaned_line = re.sub(r'^Content_\d+\s*', '', stripped_line)
            if cleaned_line:
                cleaned_lines.append(cleaned_line)

        lyrics = '\n'.join(cleaned_lines).strip()
        if not lyrics:
            return "Could not extract lyrics from the page"
        return lyrics
    except requests.exceptions.RequestException as e:
        return f"Error fetching lyrics: {str(e)}"
    except Exception as e:
        # Boundary handler: callers expect a string in every case.
        return f"Error parsing lyrics: {str(e)}"
def search_song(artist, song_title, timeout=10):
    """
    Search for a song on paroles.net and return the first result

    Queries the site's search page with "<artist> <song_title>" and picks the
    first link whose ``href`` contains ``/paroles-``.

    Args:
        artist (str): Name of the artist
        song_title (str): Title of the song
        timeout (float): Seconds to wait for the server before giving up

    Returns:
        str: URL of the first search result or error message
    """
    # Local import: used to resolve the result link against the site root.
    from urllib.parse import urljoin

    base_url = "https://www.paroles.net"

    # Format search URL
    search_query = f"{artist} {song_title}"
    search_url = f"https://www.paroles.net/recherche?q={requests.utils.quote(search_query)}"

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Without a timeout a stalled connection would block forever.
        response = requests.get(search_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the first search result link
        first_result = soup.find('a', href=lambda x: x and '/paroles-' in x)
        if not first_result:
            return "No search results found"

        # urljoin leaves absolute hrefs untouched and resolves relative ones;
        # plain concatenation would produce a doubled host for absolute links.
        return urljoin(base_url, first_result['href'])
    except requests.exceptions.RequestException as e:
        return f"Error searching for song: {str(e)}"
def main():
    """
    Command-line entry point: parse arguments, then print the lyrics.

    Accepts either a single "ARTIST - SONG TITLE" argument or two separate
    arguments (ARTIST SONG). With ``--search`` the song is first looked up
    through ``search_song`` and the artist/song parts of the found URL are
    used for the lyrics request.

    Returns:
        None: results and error messages are printed to standard output.
    """
    # Local import: used to split the search result URL into path segments.
    from urllib.parse import urlparse

    parser = argparse.ArgumentParser(description='Fetch song lyrics from paroles.net')
    parser.add_argument('query', help='Artist and song in format "ARTIST - SONG TITLE" or separate artist and song arguments')
    parser.add_argument('song', nargs='?', help='Song title (optional if using ARTIST - SONG format)')
    parser.add_argument('--search', action='store_true', help='Use search functionality instead of direct URL construction')
    args = parser.parse_args()

    # Handle both input formats:
    # 1. Single argument: "ARTIST - SONG TITLE"
    # 2. Two arguments: ARTIST SONG_TITLE
    if args.song is None:
        if " - " not in args.query:
            print("Error: Please provide artist and song in format 'ARTIST - SONG TITLE'")
            return
        # Split on the first " - " only, so titles may contain dashes.
        artist, song = args.query.split(" - ", 1)
    else:
        artist, song = args.query, args.song

    # Strip any leading/trailing whitespace
    artist = artist.strip()
    song = song.strip()

    # Input such as "Artist - " leaves one side empty; requesting a page
    # for it cannot succeed, so report it as a usage error instead.
    if not artist or not song:
        print("Error: Please provide artist and song in format 'ARTIST - SONG TITLE'")
        return

    if args.search:
        # First search for the song to get the correct URL
        search_result = search_song(artist, song)
        if not search_result.startswith("http"):
            print(search_result)  # Print error message
            return
        print(f"Found song at: {search_result}")

        # get_song_lyrics only takes artist/title, so reuse the slugs from
        # the found URL (".../<artist>/paroles-<song>"). They are presumably
        # already lowercase and hyphenated, in which case get_song_lyrics
        # rebuilds the same path. If the URL does not have that shape, keep
        # the values typed by the user.
        segments = [part for part in urlparse(search_result).path.split('/') if part]
        if len(segments) >= 2 and segments[-1].startswith('paroles-'):
            found_artist = segments[-2]
            found_song = segments[-1][len('paroles-'):]
            if found_artist and found_song:
                artist, song = found_artist, found_song

    print(get_song_lyrics(artist, song))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()