Source code for domain_extractor

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Functions for extracting domains.
"""

import re


def extract_top_domain(url):
    """
    Extract the domain from a URL, ignoring subdomains.

    An optional ``http://`` or ``https://`` prefix is skipped. If the host
    ends in a two-part suffix of the form ``co.<tld>`` (for example
    ``co.uk``), the label in front of that suffix is kept together with it;
    otherwise the last two dot-separated labels of the host are returned.
    Labels may contain letters, digits, underscores and hyphens.

    :param url: Url with https:// and /some/path
    :type url: str
    :return: domain name without protocol, subdomains or path; the input
        itself when it does not start with a dotted host name
    :rtype: str
    """
    # Order matters: the "co.<tld>" form has to be tried first, because the
    # generic pattern would also match it and cut "example.co.uk" to "co.uk".
    patterns = (
        r"(?:https?://)?(?:[\w-]+\.)*([\w-]+\.)(co\.[\w-]+)",
        r"(?:https?://)?(?:[\w-]+\.)*([\w-]+\.)([\w-]+)",
    )
    for pattern in patterns:
        # re.match anchors at the start of the string; whatever follows the
        # host (port, path, query) is simply left unmatched.
        found = re.match(pattern, url)
        if found:
            # group(1) is the last label before the suffix (with its dot),
            # group(2) is the suffix itself.
            return found.group(1) + found.group(2)
    return url

def extract_full_domain(url):
    """
    Extract the full host name (including subdomains) from a URL.

    An optional ``http://`` or ``https://`` prefix is skipped, then the
    longest run of dot-separated labels at the start of the remaining text
    is returned. Labels may contain letters, digits, underscores and
    hyphens; anything after the host (port, path, query) is dropped.

    :param url: Url with https:// and /some/path
    :type url: str
    :return: domain name without protocol or path; the input itself when it
        does not start with a host-like label
    :rtype: str
    """
    # A single capture group covers every label, so hyphenated hosts such
    # as "my-site.example.com" are no longer truncated at the first "-".
    found = re.match(r"(?:https?://)?((?:[\w-]+\.)*[\w-]+)", url)
    if found:
        return found.group(1)
    return url