ExtractBadContentScript

[[ltr]]

This script extracts external links from spammed pages and prints them as regular expressions, ready to paste into BadContent.

"""
Extract links from spam and return ready to paste regular expressions.
"""

import io
import re
import sys

try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3

# Matches http/https URLs embedded in free text.  The character class is
# deliberately permissive (it accepts quotes, brackets and punctuation), so a
# match may carry trailing punctuation from the surrounding text.
urlPattern = re.compile(r'\bhttps?://[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]+',
                        re.IGNORECASE)

# Characters that urlPattern may capture at the end of a network location but
# that belong to the surrounding text rather than to the host name.
_TRAILING_PUNCTUATION = '.,;:!\'"()'


def extractPatterns(text):
    """Return regular expressions matching the hosts linked from *text*.

    Every http/https link found in *text* is reduced to its network
    location, a leading ``www.`` is dropped, trailing punctuation picked up
    from the surrounding text is removed and literal dots are escaped.

    Links whose network location ends up empty are skipped: an empty
    regular expression would match every page.

    Returns a sorted list of unique pattern strings.
    """
    patterns = set()
    for link in urlPattern.findall(text):
        # Antispam cares only about the network location.
        netloc = urlparse.urlparse(link)[1]
        # Ignore the www subdomain, but only as a prefix: a blanket
        # replace() would also mangle hosts such as 'awww.example.com'.
        if netloc[:4].lower() == 'www.':
            netloc = netloc[4:]
        netloc = netloc.rstrip(_TRAILING_PUNCTUATION)
        if not netloc:
            continue
        patterns.add(netloc.replace('.', r'\.'))
    # Sorted so the output is deterministic and a real list on Python 2 and 3.
    return sorted(patterns)
    

def run():
    """Print the patterns extracted from the file named on the command line.

    Reads the file given as the first command line argument, passes its
    content to extractPatterns() and prints the resulting patterns, one per
    line, on standard output.

    Exits with a usage message when no file name is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE' % sys.argv[0])
    # io.open behaves the same on Python 2 and 3.  Undecodable bytes are
    # replaced instead of raising: only the ASCII URLs matter here.
    with io.open(sys.argv[1], encoding='utf-8', errors='replace') as source:
        text = source.read()
    patterns = extractPatterns(text)
    # Single-argument call form is valid on both Python 2 and Python 3.
    print('\n'.join(patterns))


# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    run()
        

last edited 2005-07-30 14:15:29 by ניר סופר