Source code for domain_extractor

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Functions for extracting domains.
"""

import re


# Host labels may legally contain hyphens (e.g. "my-site"), which ``\w`` alone
# does not match, so the label character class is ``[\w-]``.
# Tried first: keeps two-part suffixes of the form "co.<cc>" (e.g. "co.uk").
_CO_DOMAIN_RE = re.compile(r"(?:https?://)?([\w-]+\.)+(co\.\w+).*")
# Fallback: the last two dot-separated labels of the host.
_TOP_DOMAIN_RE = re.compile(r"(?:https?://)?([\w-]+\.)+([\w-]+).*")


def extract_top_domain(url):
    """
    Extracts the domain from an url. Subdomains are ignored.

    Only a leading ``http://`` or ``https://`` is recognised as protocol.
    Hosts ending in ``co.<something>`` (e.g. ``bbc.co.uk``) keep that
    two-part suffix; all other hosts are reduced to their last two labels.

    :param url: Url with https:// and /some/path
    :type url: str
    :return: domain name without protocol, subdomains or path; the
        unchanged ``url`` if it does not look like a dotted host name
    :rtype: str
    """
    for pattern in (_CO_DOMAIN_RE, _TOP_DOMAIN_RE):
        match = pattern.match(url)
        if match:
            # A repeated capture group only retains its final repetition,
            # so group(1) is the label directly in front of the suffix.
            return match.group(1) + match.group(2)
    return url
# Host labels may legally contain hyphens (e.g. "my-site"), which ``\w`` alone
# does not match, so the label character class is ``[\w-]``.
_FULL_DOMAIN_RE = re.compile(r"(?:https?://)?((?:[\w-]+\.)*)([\w-]+).*")


def extract_full_domain(url):
    """
    Extracts the domain from an url, keeping all subdomains.

    Only a leading ``http://`` or ``https://`` is recognised as protocol.

    :param url: Url with https:// and /some/path
    :type url: str
    :return: domain name without protocol or path; the unchanged ``url``
        if it does not start with a host-like label
    :rtype: str
    """
    match = _FULL_DOMAIN_RE.match(url)
    if match is None:
        return url
    # group(1) is every "label." prefix (possibly empty), group(2) the
    # final label; concatenated they form the complete host name.
    return match.group(1) + match.group(2)