본문 바로가기
Python

깨진 링크 검사기 (broken link checker)

by 앗사비 2023. 8. 23.
728x90

GPT의 도움을 받아 작성한 코드입니다.

 

import sys
import time
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Pages are fetched with verify=False (certificate checks off), which makes
# urllib3 raise an InsecureRequestWarning; silence it so the crawl output
# stays readable.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)


# Sites offered in the selection menu, keyed by the text the user types to
# pick one; each value is the URL at which the crawl starts.
SUPPORTED_SITES = {
    "1": "https://www.test1.com/",
    "2": "https://www.test2.com/",
    "3": "https://www.test3.com/"
}


# Substrings that mark a link as not worth fetching: non-page schemes and
# common image extensions.
EXCLUSION_LIST = [
    "mailto:",
    "javascript:",
    ".png",
    ".jpg",
    ".jpeg",
]


def is_excluded(url):
    """Tell whether *url* contains any substring listed in EXCLUSION_LIST.

    The match is a plain, case-sensitive substring test anywhere in the URL.
    """
    for fragment in EXCLUSION_LIST:
        if fragment in url:
            return True
    return False


def prompt_user_for_site():
    """Print the numbered site menu and return the URL the user selects.

    Returns None when the entered text is not a key of SUPPORTED_SITES.
    """
    print("\nSupported sites:\n")
    for number in SUPPORTED_SITES:
        site_url = SUPPORTED_SITES[number]
        print(f"{number}. {site_url}")

    answer = input("\nSelect a number: ")
    return SUPPORTED_SITES.get(answer)


def extract_links_from_html(html):
    """Return the ``href`` value of every anchor in *html* that has one.

    The values are returned exactly as written in the markup, in document
    order; relative links are not resolved here.
    """
    soup = BeautifulSoup(html, features="lxml")
    anchors = soup.select("a[href]")
    return [anchor["href"] for anchor in anchors]


def find_broken_links(domain, starting_url, headers, timeout=10):
    """Crawl outward from ``starting_url`` and return the broken links found.

    Pages whose host equals ``domain`` are parsed for further links. Links to
    other hosts are fetched once to check them but are never crawled.
    Redirects are not followed and certificates are not verified.

    Args:
        domain: Network location (as given by ``urlparse(...).netloc``) of
            the site whose pages are crawled.
        starting_url: URL at which the crawl begins.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait on each request before giving up.

    Returns:
        A list of ``(broken_url, source_url)`` tuples, where ``source_url``
        is the page on which the broken link was first seen (the starting
        URL is reported as its own source). A URL counts as broken when it
        answers with HTTP 404 or when the request fails altogether.
    """
    broken = []
    # Each pending URL maps to the page that first linked to it, so a broken
    # link can be reported together with the page it was found on.
    to_search = {starting_url: starting_url}
    searched = set()

    while to_search:
        current_url, source_url = to_search.popitem()
        searched.add(current_url)

        if is_excluded(current_url):
            continue
        # requests only has adapters for http(s); tel:, ftp:, data: and
        # similar links would raise instead of being checked.
        if urlparse(current_url).scheme not in ("http", "https"):
            continue

        # Show the user the URL that is currently being checked
        print(f"Checking: {current_url}")

        try:
            response = requests.get(
                current_url,
                headers=headers,
                allow_redirects=False,
                verify=False,
                timeout=timeout,  # without this a silent host stalls the crawl
            )
        except requests.exceptions.RequestException as error:
            # An unreachable target is a dead link for the reader as well;
            # record it and keep crawling instead of aborting the whole run.
            print(f"Request failed: {current_url} ({error})")
            broken.append((current_url, source_url))
            continue

        if response.status_code == 404:
            broken.append((current_url, source_url))
        elif urlparse(current_url).netloc == domain:
            for link in extract_links_from_html(response.text):
                # Drop "#fragment" so a page is not fetched once per anchor.
                target = urldefrag(urljoin(current_url, link)).url
                if target not in searched and target not in to_search:
                    to_search[target] = current_url

    return broken



def main():
    """Ask the user for a site, crawl it for broken links, and print a report.

    Exits with an error message when the entered selection is not one of the
    supported sites.
    """
    selected_site = prompt_user_for_site()
    if not selected_site:
        # A string argument is printed to stderr and yields a non-zero exit
        # status, so a mistyped choice no longer ends the program silently.
        sys.exit("Invalid selection.")

    # Start timing only after the prompt, so the reported execution time
    # covers the crawl and not the time spent waiting for the user's answer.
    start_time = time.time()

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/605.1.15"
    }

    broken_links = find_broken_links(urlparse(selected_site).netloc, selected_site, headers)

    print("\n--- Done! ---\n")
    if broken_links:
        print("Broken links detected:")
        for link, source in broken_links:
            print(f"\tBROKEN: {link} (from {source})")
    else:
        print("No broken links detected!")

    elapsed_time = time.time() - start_time
    print(f"Execution time: {elapsed_time:.2f} seconds")


# Run the checker only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
728x90

'Python' 카테고리의 다른 글

ORM 방식으로 db에 데이터 추가하기  (0) 2023.12.15
도커로 postgresql 환경 구축  (0) 2023.09.05
yield 한 방에 이해하기  (0) 2023.08.04
edge-tts 사용해보기  (0) 2023.07.21
파이썬 SSL 오류 발생시  (0) 2023.07.12