728x90
GPT의 도움을 받아 작성
import requests
import sys
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import time
# Silence urllib3's InsecureRequestWarning: the crawler in this file sends its
# requests with verify=False (certificate verification turned off), so this
# warning would only add noise to the output.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
# Sites the user can pick from, keyed by the menu number typed at the prompt.
SUPPORTED_SITES = {
    "1": "https://www.test1.com/",
    "2": "https://www.test2.com/",
    "3": "https://www.test3.com/",
}

# Substrings that mark a URL as one the crawler should not request.
EXCLUSION_LIST = [
    "mailto:",
    "javascript:",
    ".png",
    ".jpg",
    ".jpeg",
]
def is_excluded(url):
    """Return True if *url* should be skipped by the crawler.

    A URL is excluded when it contains any entry of ``EXCLUSION_LIST`` as a
    substring. The comparison is case-insensitive so that variants such as
    ``MAILTO:someone@example.com`` or ``photo.JPG`` are excluded too.
    """
    lowered = url.lower()
    return any(excl.lower() in lowered for excl in EXCLUSION_LIST)
def prompt_user_for_site():
    """Show the site menu and return the URL the user picked.

    Prints every entry of ``SUPPORTED_SITES`` as ``<number>. <url>``, reads
    one line from standard input, and looks that text up in the menu.
    Returns ``None`` when the input does not match any menu number.
    """
    print("\nSupported sites:\n")
    for number in SUPPORTED_SITES:
        url = SUPPORTED_SITES[number]
        print(f"{number}. {url}")

    selection = input("\nSelect a number: ")
    return SUPPORTED_SITES.get(selection)
def extract_links_from_html(html):
    """Return the ``href`` value of every ``<a>`` tag in *html* that has one.

    The values are returned exactly as written in the markup (they may be
    relative), in the order the selector yields them.
    """
    soup = BeautifulSoup(html, features="lxml")
    hrefs = []
    for anchor in soup.select("a[href]"):
        hrefs.append(anchor["href"])
    return hrefs
def find_broken_links(domain, starting_url, headers, timeout=10):
    """Crawl from *starting_url* and return the broken links that were found.

    Every discovered URL is requested once. Pages whose host equals *domain*
    are parsed for further links; pages on other hosts are only checked, not
    crawled. Redirects are not followed.

    Args:
        domain: Host (``netloc``) whose pages should be crawled for links.
        starting_url: URL where the crawl begins.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each request before giving up, so one
            unresponsive server cannot stall the whole crawl.

    Returns:
        A list of ``(broken_url, source_url)`` tuples, where ``source_url``
        is the page on which the broken link was first seen (the starting
        URL itself for the first request). A link counts as broken when it
        answers with HTTP 404 or when the request fails entirely
        (connection error, timeout, invalid URL, ...).
    """
    broken = []
    # Pending URLs mapped to the page that first linked to them, so a broken
    # link can be reported together with where it actually came from.
    to_search = {starting_url: starting_url}
    searched = set()

    while to_search:
        current_url, source_url = to_search.popitem()
        if current_url in searched or is_excluded(current_url):
            continue
        searched.add(current_url)

        # Only HTTP(S) can be requested; schemes such as tel: or ftp: would
        # make requests raise instead of telling us anything about the link.
        if urlparse(current_url).scheme not in ("http", "https"):
            continue

        # Show the user the URL that is currently being checked.
        print(f"Checking: {current_url}")
        try:
            response = requests.get(
                current_url,
                headers=headers,
                allow_redirects=False,
                verify=False,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            # An unreachable link is a broken link; record it and keep going
            # rather than letting one bad host abort the whole crawl.
            print(f"\tRequest failed: {exc}")
            broken.append((current_url, source_url))
            continue

        if response.status_code == 404:
            broken.append((current_url, source_url))
        elif urlparse(current_url).netloc == domain:
            for link in extract_links_from_html(response.text):
                # Drop the fragment: "page#a" and "page#b" are the same
                # resource and should only be fetched once.
                absolute = urljoin(current_url, link).split("#", 1)[0]
                if absolute not in searched:
                    to_search.setdefault(absolute, current_url)

    return broken
def main():
    """Run the interactive broken-link checker.

    Asks the user which supported site to crawl, crawls it with
    ``find_broken_links``, prints every broken link that was found, and
    finally prints how long the crawl took. Exits with an error message when
    the user's selection does not match a supported site.
    """
    selected_site = prompt_user_for_site()
    if not selected_site:
        # Tell the user why the program stops instead of exiting silently.
        sys.exit("Invalid selection; exiting.")

    # Start the clock only after the prompt so the reported duration covers
    # the crawl itself, not the time the user spent answering.
    start_time = time.time()

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/605.1.15"
    }
    broken_links = find_broken_links(urlparse(selected_site).netloc, selected_site, headers)

    print("\n--- Done! ---\n")
    if broken_links:
        print("Broken links detected:")
        for link, source in broken_links:
            print(f"\tBROKEN: {link} (from {source})")
    else:
        print("No broken links detected!")

    elapsed_time = time.time() - start_time
    print(f"Execution time: {elapsed_time:.2f} seconds")
# Run the checker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
728x90
'Python' 카테고리의 다른 글
ORM 방식으로 db에 데이터 추가하기 (0) | 2023.12.15 |
---|---|
도커로 postgresql 환경 구축 (0) | 2023.09.05 |
yield 한 방에 이해하기 (0) | 2023.08.04 |
edge-tts 사용해보기 (0) | 2023.07.21 |
파이썬 SSL 오류 발생시 (0) | 2023.07.12 |