Regular Expressions / Regex / Regexp

######## CHAPTER 11

######## REGULAR EXPRESSIONS -- A Fun and Interesting Topic

######## 5:55:24

# Regular Expression / Regex / Regexp

# In computing, they provide a concise and flexible means for matching "strings" of text,

# such as particular characters, words, or patterns of characters.

# They (regex) are written in formal language that can be interpreted by a "regex-processor".

# Uses of Regex:

# you can do smart searching

# Regex are really clever "wild card" expressions for matching and parsing strings.

# They are almost programmable wild card expressions: there are no explicit loops, but the repeat characters (* and +) loop implicitly.

# and there is all those implicit things. e.g. you say look for patterns that look like this or that.

# and then you get back things that match those pattern.

# We do searching for everything. Regex are very structured way to go about "searching" for information or "finding" a particular outcome.

# ----------------------------

### Understanding Regular Expressions

# - very powerful and quite cryptic (with Mysterious Meaning / hard to understand)

# - Fun once you understand them

# - Regular expressions are language unto themselves

# - A language of "marker characters" - programming with characters

# - It is kind of an "old school" language - compact. It comes from around 1960

# ============================================

### Regular Expression Quick Guide

# ^ Matches the "beginning" of a line

# $ Matches the "end" of a line

# . Matches "any" character

# \s Matches "whitespace"

# \S Matches any "non-whitespace" character

# * Repeats a character "zero" or "more" times

# *? Repeats a character "zero" or "more" times (non-greedy)

# + Repeats a character "one" or "more" times

# +? Repeats a character "one" or "more" times (non-greedy)

# [aeiou] Matches a single character "in" the listed "set"

# [^XYZ] Matches a single character "not in" the listed "set"

# [a-z0-9] The set of characters can include a "range"

# ( Indicates where string "extraction" is to start

# ) Indicates where a string "extraction" is to end

# ============================================

### The Regexp Module

# To use regex, import the library using "import re"

# re-library-dir:

# ['A', 'ASCII', 'DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE',

# 'M', 'MULTILINE', 'Match', 'Pattern', 'RegexFlag', 'S', 'Scanner',

# 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE',

# '__all__', '__builtins__', '__cached__', '__doc__', '__file__',

# '__loader__', '__name__', '__package__', '__spec__', '__version__',

# '_cache', '_compile', '_compile_repl', '_expand', '_locale',

# '_pickle', '_special_chars_map', '_subx', 'compile', 'copyreg',

# 'enum', 'error', 'escape', 'findall', 'finditer', 'fullmatch',

# 'functools', 'match', 'purge', 'search', 'split', 'sre_compile',

# 'sre_parse', 'sub', 'subn', 'template']

### Following are just one characters in dir-regex-library:

# A, I, L, M, S, T U, X

# You can use "re.search()" to see if a string matches a regular expression;

# similar to using the "find()" method for strings

# You can use "re.findall()" to extract portions of a string that matches your regular expression;

# similar to a combination of "find()" and slicing: "var[5:10]"

# ============================================

# Code without "Regex"

# Print every line of 'emaildata' that contains "From: " anywhere in it,
# using plain string methods only (no regex).
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()        # using ".rstrip()" to avoid a new line at the end of every line
        if line.find('From: ') >= 0:    # find() returns -1 when absent; a regex can do this test too
            print(line)

# ---------------

# code with regular expression library.

# Code without "Regex"

#hand = open('emaildata')

#for line in hand:

# line = line.rstrip() # using ".rstrip()" to avoid new line at the end of every line

# if line.find('From: ') >= 0: # here we can use regex to find particular characters in file.

# print(line)

# ---------------

import re   # the library has to be imported, otherwise the regex code will not work

# Same filter as the find() version, expressed with re.search().
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        if re.search('From: ', line):   # a Match object is truthy; None means "no match"
            print(line)

# we will say " if re.search('From: ', line): # like this:

# within the library regular expression. go find the "search-function", and search for the string "from: ", in the string "line".

# -----------------------

# using re.search() like startswith().

# We fine-tune what is matched by adding special character to the string.

# Print only the lines of 'emaildata' that begin with "From: ".
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        if line.startswith('From: '):
            print(line)

# -----------------------

## Alternatively to ".startswith(), regex-code uses "^" character.

#hand = open('emaildata')

#for line in hand:

# line = line.rstrip()

# if line.startswith('From: '):

# print(line)

import re

# Regex equivalent of line.startswith('From: ').
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        # ^ -- matches the beginning of a line, so a "From: " that appears
        # in the middle of the line is ignored.
        if re.search(r"^From: ", line):
            print(line)

# "^" -- played the same role as ".startswith()"

# Hence fine tuning the search.

# using "^" -- we have turned the first parameter into "code". "^From: ". now its a code.

# So the difference is: in other case we look for the method like ".startswith()" but in regex we programe the regular expression like "^From: ".

# ^ == caret -- that's the name (pronounced like "carrot").

# You run out of string methods long before you run out of things you can do with Regular Expressions (or Regex/Regexp).

# ============================================

### Wild-Card Characters ###

# ^X.*:

# ^ = match at start of line

# X = a character that matches itself

# . = a wild-card matches any character

# * = Many characters

import re

# Print the lines that start with "X", followed by any characters, up to a ":".
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        # The pattern now includes the ":" described in the notes above (^X.*:);
        # without it, "^X.*" simply matched every line that starts with "X".
        if re.search(r"^X.*:", line):
            print(line)

# --------------------

## Fine-Tuning your Match ##

import re

with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        # Looser first attempt, kept for comparison:
        #    if re.search(r"^X.*:", line):
        #        print(line)

        # Depending on how "clean" your data is and the purpose of your
        # application, you may want to narrow your match down a bit:
        # "^X-", then one or more non-whitespace characters, then ":".
        # The r"..." raw string keeps "\S" intact for the regex engine.
        if re.search(r"^X-\S+:", line):
            print(line)

# ==========================

## From Matching to extraction. -- starts @ 6:05:42

# What we will do?

# Use what we have learned in previous chapter to pull data out of strings, using regular-expression-library.

## Matching and EXtracting ##

import re

x = 'My 2 favorite numbers are 19 and 42'

# "[0-9]" matches one digit at a time; "[0-9]+" grabs each whole run of digits.
y = re.compile('[0-9]').findall(x)
print(y)                                # ['2', '1', '9', '4', '2']
y = re.compile('[0-9]+').findall(x)
print(y)                                # ['2', '19', '42']

# ---------------------------
# Narrowing the range to "[0-4]" leaves the "9" out of every match.
y = re.compile('[0-4]').findall(x)
print(y)                                # ['2', '1', '4', '2']
y = re.compile('[0-4]+').findall(x)
print(y)                                # ['2', '1', '42']

# So, [0-9] is just a range we can manipulate as per our requirement.

# ---------------------------

# "re.search()" returns a Match object when the string matches the regular expression and None when it does not, so it can be used like True/False in an "if".

# If we actually want the matching strings to be extracted, we use "re.findall()".

# The output of .findall will be in the form of a list.

# Just for fun:::

# re.search() stops at the FIRST place the pattern matches and returns a
# Match object (or None when nothing matches); unlike re.findall() it does
# not collect every match into a list.
# Expected output, in order:
#   '[0-9]'   <re.Match object; span=(3, 4), match='2'>
#   '[0-9]+'  <re.Match object; span=(3, 4), match='2'>    (the "2" is followed by a blank)
#   '[0-1]'   <re.Match object; span=(26, 27), match='1'>  (first 0/1 is the "1" of "19")
#   '[0-1]+'  <re.Match object; span=(26, 27), match='1'>  ("9" is outside the range)
for pattern in ('[0-9]', '[0-9]+', '[0-1]', '[0-1]+'):
    y = re.search(pattern, x)
    print(y)

# But i am still confused, what does all this means.

# Remember: the instructor said that search gives True/False. More precisely, it returns a Match object (truthy) or None (falsy), which is why it works inside an "if".

# -----------------------------

# "[AEIOU]", setting parameter of alphabets.

# in case nothing match in .findall() then it gives an empty list, not True/False like .search()

# in .findall(), operation will be case sensitive. AND watch for the specific alphabet you are looking for

import re

x = 'My 2 favorite numbers are 19 and 42'

# Character classes are case sensitive and match only the characters listed;
# when nothing matches, findall() returns an empty list.
# The expected output of each pattern is shown next to it.
for pattern in (
    '[0-9]+',       # ['2', '19', '42']
    '[AEIOU]+',     # []
    '[AEIOU]',      # []
    '[MFtv]+',      # ['M', 'v', 't']
    '[a-z]',        # ['y', 'f', 'a', 'v', 'o', 'r', 'i', 't', 'e', 'n', 'u', 'm', 'b', 'e', 'r', 's', 'a', 'r', 'e', 'a', 'n', 'd']
    '[a-z]+',       # ['y', 'favorite', 'numbers', 'are', 'and']  -- capital "M" is not extracted
    '[aeiou]',      # ['a', 'o', 'i', 'e', 'u', 'e', 'a', 'e', 'a']
    '[aeiou]+',     # ['a', 'o', 'i', 'e', 'u', 'e', 'a', 'e', 'a']
):
    y = re.findall(pattern, x)
    print(y)

# ============================================

# Warning: Greedy Matching

# the "repeat" characters (* and +) push "outward" in both directions (greedy) to match the largest possible string.

# It chooses the largest possible (of the overlaping) string

# Anytime it has option, it chooses the largest one.

import re

x = 'From: Using the : character'
# ".+" is greedy: it stretches to the LAST ":" on the line.
greedy_pattern = re.compile('^F.+:')
y = greedy_pattern.findall(x)
print(y)                    # ['From: Using the :']

# Why not 'From:'?

# Explaination:

# ^F - First character in the match is an F

# : - Last character in the match is a :

# .+ - One or more characters

# ============================================

## Non-Greedy MAtching

# if you add a "?" character, the "+" and "*" chill out a bit;

# adding ? means, it prefers the shorter of the strings.

import re

x = 'From: Using the : character'
# ".+?" is non-greedy: it stops at the FIRST ":" it can reach.
lazy_pattern = re.compile('^F.+?:')
y = lazy_pattern.findall(x)
print(y)                    # ['From:']

# Why this time only 'From:'?

# Explaination:

# ^F - First character in the match is an F

# : - Last character in the match is a :

# .+? - One or more characters "but" not greedy

# --------------------------------

# Greedy is the default

# Non-Greedy is the optional.

# --------------------------------

# ============================================

# Fine-Tuning String Extraction #

# you can refine the match for "re.findall()" and separately determine which portion of the match -

# is to be extracted by using parenthesis "()".

# Though we have done this with other techniques as well.

import re

x = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'

# Patterns are written as raw strings (r'...') so that "\S" reaches the regex
# engine untouched; in a plain string it is an invalid escape sequence that
# newer Python versions warn about.
y = re.findall(r'\S+@\S+', x)
print(y)                    # ['stephen.marquard@uct.ac.za']

# Without "+" (i.e. one or more characters) each side of the "@" is limited to one character:
y = re.findall(r'\S@\S', x)
print(y)                    # ['d@u']

# --------------------------------
# "+" on the left side only:
y = re.findall(r'\S+@\S', x)
print(y)                    # ['stephen.marquard@u']

# "+" on the right side only:
y = re.findall(r'\S@\S+', x)
print(y)                    # ['d@uct.ac.za']

# --------------------------------
# One character after the "@":
y = re.findall(r'@\S', x)
print(y)                    # ['@u']

# One character before the "@":
y = re.findall(r'\S@', x)
print(y)                    # ['d@']

# ---------------------------

#======================================

# Fine-Tuning String Extraction #

# Parentheses "()" are not part of the match

# - but they tell where to "start" and "stop", what string to extract.

# We can give it a matching string that is different from the extracting string by adding paranthesis "()".

import re

x = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'

# Raw strings (r'...') keep "\S" intact for the regex engine.

# Without (): the whole match is returned.
y = re.findall(r'From \S+@\S+', x)
print(y)                        # ['From stephen.marquard@uct.ac.za']

# Again without (), but anchored to the start of the line:
y = re.findall(r'^From \S+@\S+', x)
print(y)                        # ['From stephen.marquard@uct.ac.za']

# With (): the whole pattern must match, but only the part inside () is extracted.
y = re.findall(r'^From (\S+@\S+)', x)
print(y)                        # ['stephen.marquard@uct.ac.za']

# ============================================

# String Parsing Examples # -- Starts @ 6:14:01

# Now we are going to know couple of ways to use these new found skills.

## Regular Expressions: String Parcing @ 6:14:08

# some more practical applications or regular expression:

# 1: extracting a host name - using find and string slicing:

data = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'

# Locate the "@", then the first blank after it; the host name is the text
# strictly between those two positions.
atpos = data.find('@')
sppos = data.find(' ', atpos)
host = data[atpos + 1:sppos]
for value in (atpos, sppos, host):
    print(value)                    # 21, then 31, then uct.ac.za

# ----------------------------

# Now we are going to know couple of ways to use these new found skills.

## Regular Expressions: String Parcing @ 6:14:08

# Double Split Pattern -- @nd method

line = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'

# First split on whitespace to isolate the address (second word),
# then split the address on "@" to separate user and host.
words = line.split()
email = words[1]
pieces = email.split('@')
# Printed one per line:
#   ['From', 'stephen.marquard@uct.ac.za', 'Sat', 'Jan', '5', '09:14:16', '2008']
#   ['stephen.marquard', 'uct.ac.za']
#   uct.ac.za
print(words, pieces, pieces[-1], sep='\n')

# ----------------------------

# Regex Expression

import re

lin = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'

# Raw strings (r'...') keep "\S" intact for the regex engine.
y = re.findall(r'@\S+', lin)
print(y)                        # ['@uct.ac.za']    -- but this includes the "@".

# A better solution for this requirement uses parentheses "()" so that
# only the part after the "@" is extracted:
y = re.findall(r'@(\S+)', lin)
print(y)                        # ['uct.ac.za']

# OR
y = re.findall(r'@([^ ]*)', lin)
print(y)                        # ['uct.ac.za']

# Explanation:
#   [^ ]  --  match a non-blank character
#   *     --  match zero or more of them

# Fine tuning: the line must also start with "From ".
y = re.findall(r'^From .*@([^ ]*)', lin)
print(y)                        # ['uct.ac.za']

# Keep a check on the spacing between characters in the pattern.

# Keep check on spacing between characters.

# ============================================

# Spam Confidence #

import re

numlist = list()
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        stuff = re.findall(r'^X-DSPAM-Confidence: ([0-9.]+)', line)

        # Explanation:
        #   If the line does not have this shape -- e.g. the prefix is missing
        #   or no number such as "0.8475" follows it -- findall() returns
        #   an empty list "[]".
        #   So the first thing to do is check whether we got a match;
        #   the "if" below skips every line that did not yield exactly one number.
        if len(stuff) != 1:
            continue
        # print(stuff)                  # commented out.
        num = float(stuff[0])
        numlist.append(num)

# print(numlist) # [0.8475, 0.6178, 0.6961, 0.7565, 0.7626, 0.7556, 0.7002, 0.7615, 0.7601, 0.7605, 0.6959, 0.7606, 0.7559, 0.7605, 0.6932, 0.7558, 0.6526, 0.6948, 0.6528, 0.7002, 0.7554, 0.6956, 0.6959, 0.7556, 0.9846, 0.8509, 0.9907]

if numlist:
    print('Maximum: ', max(numlist))    # Maximum: 0.9907
else:
    # max() raises ValueError on an empty list, so report the situation instead.
    print('No X-DSPAM-Confidence lines found')

# Clean code of above example:

# Spam Confidence #

import re

# Collect every X-DSPAM-Confidence value in 'emaildata' and report the largest.
numlist = list()
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        stuff = re.findall(r'^X-DSPAM-Confidence: ([0-9.]+)', line)
        if len(stuff) != 1:         # skip lines that do not yield exactly one number
            continue
        num = float(stuff[0])
        numlist.append(num)

if numlist:
    print('Maximum: ', max(numlist))    # Maximum: 0.9907
else:
    # max() raises ValueError on an empty list, so report the situation instead.
    print('No X-DSPAM-Confidence lines found')

# ============================================

# Escape Character #

# if you want a special regular expression character (e.g $ -- MAtches the end of the line) to just behave normally

# (most of the time) you prefix it with "\" -- (that is back slash).

import re

x = 'We just received $ 10.00 for cookies'
# The raw string keeps the backslash, so the regex engine sees "\$"
# (a literal dollar sign) rather than "$" (end of line).
y = re.findall(r'\$ [0-9.]+', x)
print(y)                                # ['$ 10.00']

# where:
#   \$          a real dollar sign
#   [0-9.]      a digit or period
#   +           at least one or more

pak = 'We just received 10.00 pkr for cookies'
# Search the new sentence ("pak"); the original searched "x" again by mistake,
# which only happened to give the same answer.
y = re.findall(r'[0-9.]+', pak)
print(y)                                # ['10.00']

# ============================================

### Summary ###

# Regular expressions are a cryptic but powerful language for

# matching strings and extracting elements from those strings.

# Regular expressions have special characters that indicate intent.

# ============================================

#ToDo: resume lecture from 6:22:32


                                        ##################
                                        ##################
                                        # POSTED THIS FAR
                                        # Date 22 July, 2020
                                        # Title: Tuples
                                        ##################
                                        ##################

# ===========================================
# ===========================================
# ===========================================
# ===========================================

######## CHAPTER 11
######## REGULAR EXPRESSIONS -- A Fun and Interesting Topic
######## 5:55:24

# Regular Expression / Regex / Regexp
# In computing, they provide a concise and flexible means for matching "strings" of text,
    # such as particular characters, words, or patterns of characters.
# They (regex) are written in formal language that can be interpreted by a "regex-processor".

# Uses of Regex:
    # you can do smart searching
# Regex are really clever "wild card" expressions for matching and parsing strings.
    # They are almost programmable wild card expressions: there are no explicit loops, but the repeat characters (* and +) loop implicitly.
        # and there is all those implicit things. e.g. you say look for patterns that look like this or that.
            # and then you get back things that match those pattern.

# We do searching for everything. Regex are very structured way to go about "searching" for information or "finding" a particular outcome.

# ----------------------------

### Understanding Regular Expressions

# - very powerful and quite cryptic (with Mysterious Meaning / hard to understand)
# - Fun once you understand them
# - Regular expressions are language unto themselves
# - A language of "marker characters" - programming with characters
# - It is kind of an "old school" language - compact. It comes from around 1960


# ============================================

### Regular Expression Quick Guide

#       ^           Matches the "beginning" of a line
#       $           Matches the "end" of a line
#       .           Matches "any" character
#       \s          Matches "whitespace"
#       \S          Matches any "non-whitespace" character
#       *           Repeats a character "zero" or "more" times
#       *?          Repeats a character "zero" or "more" times (non-greedy)
#       +           Repeats a character "one" or "more" times
#       +?          Repeats a character "one" or "more" times (non-greedy)
#       [aeiou]     Matches a single character "in" the listed "set"
#       [^XYZ]      Matches a single character "not in" the listed "set"
#       [a-z0-9]    The set of characters can include a "range"
#       (           Indicates where string "extraction" is to start
#       )           Indicates where a string "extraction" is to end


# ============================================

### The Regexp Module

#   To use regex, import the library using "import re"
        # re-library-dir:
            # ['A', 'ASCII', 'DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE',
            # 'M', 'MULTILINE', 'Match', 'Pattern', 'RegexFlag', 'S', 'Scanner',
            # 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE',
            # '__all__', '__builtins__', '__cached__', '__doc__', '__file__',
            # '__loader__', '__name__', '__package__', '__spec__', '__version__',
            # '_cache', '_compile', '_compile_repl', '_expand', '_locale',
            # '_pickle', '_special_chars_map', '_subx', 'compile', 'copyreg',
            # 'enum', 'error', 'escape', 'findall', 'finditer', 'fullmatch',
            # 'functools', 'match', 'purge', 'search', 'split', 'sre_compile',
            # 'sre_parse', 'sub', 'subn', 'template']

            ### Following are just one characters in dir-regex-library:
                # A, I, L, M, S, T U, X

#   You can use "re.search()" to see if a string matches a regular expression;
        # similar to using the "find()" method for strings

#   You can use "re.findall()" to extract portions of a string that matches your regular expression;
        # similar to a combination of "find()" and slicing: "var[5:10]"


# ============================================
# Code without "Regex"
# Print every line of 'emaildata' that contains "From: " anywhere in it,
# using plain string methods only (no regex).
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()        # using ".rstrip()" to avoid a new line at the end of every line
        if line.find('From: ') >= 0:    # find() returns -1 when absent; a regex can do this test too
            print(line)
# ---------------

# code with regular expression library.

# Code without "Regex"
#hand = open('emaildata')
#for line in hand:
#    line = line.rstrip()        # using ".rstrip()" to avoid new line at the end of every line
#    if line.find('From: ') >= 0:        # here we can use regex to find particular characters in file.
#        print(line)
# ---------------

import re                               # the library has to be imported, otherwise the regex code will not work

# Same filter as the find() version, expressed with re.search().
with open('emaildata') as hand:         # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        if re.search('From: ', line):   # a Match object is truthy; None means "no match"
            print(line)

# we will say "    if re.search('From: ', line): # like this:
    # within the library regular expression. go find the "search-function", and search for the string "from: ", in the string "line".


# -----------------------
# -----------------------
# -----------------------

# using re.search() like startswith().

# We fine-tune what is matched by adding special character to the string.


# Print only the lines of 'emaildata' that begin with "From: ".
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        if line.startswith('From: '):
            print(line)

# -----------------------
# -----------------------

## Alternatively to ".startswith(), regex-code uses "^" character.

#hand = open('emaildata')

#for line in hand:
#    line = line.rstrip()
#    if line.startswith('From: '):
#        print(line)

import re

# Regex equivalent of line.startswith('From: ').
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        # ^ -- matches the beginning of a line, so a "From: " that appears
        # in the middle of the line is ignored.
        if re.search(r"^From: ", line):
            print(line)

# "^" -- played the same role as ".startswith()"
    # Hence fine tuning the search.
# using "^" -- we have turned the first parameter into "code". "^From: ".  now its a code.

# So the difference is: in other case we look for the method like ".startswith()" but in regex we programe the regular expression like "^From: ".
# ^ == caret       --      that's the name (pronounced like "carrot").
# You run out of string methods long before you run out of things you can do with Regular Expressions (or Regex/Regexp).



# ============================================

### Wild-Card Characters ###

# ^X.*:
#   ^ = match at start of line
#   X = a character that matches itself
#   . = a wild-card matches any character
#   * = Many characters

import re

# Print the lines that start with "X", followed by any characters, up to a ":".
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        # The pattern now includes the ":" described in the notes above (^X.*:);
        # without it, "^X.*" simply matched every line that starts with "X".
        if re.search(r"^X.*:", line):
            print(line)

# --------------------
# --------------------

## Fine-Tuning your Match ##
import re

with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        # Looser first attempt, kept for comparison:
        #    if re.search(r"^X.*:", line):
        #        print(line)

        # Depending on how "clean" your data is and the purpose of your
        # application, you may want to narrow your match down a bit:
        # "^X-", then one or more non-whitespace characters, then ":".
        # The r"..." raw string keeps "\S" intact for the regex engine.
        if re.search(r"^X-\S+:", line):
            print(line)

# ==========================

## From Matching to extraction. -- starts @ 6:05:42

# What we will do?
    # Use what we have learned in previous chapter to pull data out of strings, using regular-expression-library.

## Matching and EXtracting ##

import re

x = 'My 2 favorite numbers are 19 and 42'

# "[0-9]" matches one digit at a time; "[0-9]+" grabs each whole run of digits.
y = re.compile('[0-9]').findall(x)
print(y)                                # ['2', '1', '9', '4', '2']
y = re.compile('[0-9]+').findall(x)
print(y)                                # ['2', '19', '42']

# ---------------------------
# Narrowing the range to "[0-4]" leaves the "9" out of every match.
y = re.compile('[0-4]').findall(x)
print(y)                                # ['2', '1', '4', '2']
y = re.compile('[0-4]+').findall(x)
print(y)                                # ['2', '1', '42']

# So, [0-9] is just a range we can manipulate as per our requirement.

# ---------------------------
# ---------------------------


# "re.search()" returns a Match object when the string matches the regular expression and None when it does not, so it can be used like True/False in an "if".
# If we actually want the matching strings to be extracted, we use "re.findall()".
# The output of .findall will be in the form of a list.


# Just for fun:::

# re.search() stops at the FIRST place the pattern matches and returns a
# Match object (or None when nothing matches); unlike re.findall() it does
# not collect every match into a list.
# Expected output, in order:
#   '[0-9]'   <re.Match object; span=(3, 4), match='2'>
#   '[0-9]+'  <re.Match object; span=(3, 4), match='2'>    (the "2" is followed by a blank)
#   '[0-1]'   <re.Match object; span=(26, 27), match='1'>  (first 0/1 is the "1" of "19")
#   '[0-1]+'  <re.Match object; span=(26, 27), match='1'>  ("9" is outside the range)
for pattern in ('[0-9]', '[0-9]+', '[0-1]', '[0-1]+'):
    y = re.search(pattern, x)
    print(y)

# But i am still confused, what does all this means.

# Remember: the instructor said that search gives True/False. More precisely, it returns a Match object (truthy) or None (falsy), which is why it works inside an "if".

# -----------------------------
# -----------------------------

# "[AEIOU]", setting parameter of alphabets.


# in case nothing match in .findall() then it gives an empty list, not True/False like .search()
# in .findall(), operation will be case sensitive. AND watch for the specific alphabet you are looking for


import re

x = 'My 2 favorite numbers are 19 and 42'

# Character classes are case sensitive and match only the characters listed;
# when nothing matches, findall() returns an empty list.
# The expected output of each pattern is shown next to it.
for pattern in (
    '[0-9]+',       # ['2', '19', '42']
    '[AEIOU]+',     # []
    '[AEIOU]',      # []
    '[MFtv]+',      # ['M', 'v', 't']
    '[a-z]',        # ['y', 'f', 'a', 'v', 'o', 'r', 'i', 't', 'e', 'n', 'u', 'm', 'b', 'e', 'r', 's', 'a', 'r', 'e', 'a', 'n', 'd']
    '[a-z]+',       # ['y', 'favorite', 'numbers', 'are', 'and']  -- capital "M" is not extracted
    '[aeiou]',      # ['a', 'o', 'i', 'e', 'u', 'e', 'a', 'e', 'a']
    '[aeiou]+',     # ['a', 'o', 'i', 'e', 'u', 'e', 'a', 'e', 'a']
):
    y = re.findall(pattern, x)
    print(y)

# ============================================
# ============================================

# Warning: Greedy Matching
# Warning: Greedy Matching
# Warning: Greedy Matching
# Warning: Greedy Matching

# the "repeat" characters (* and +) push "outward" in both directions (greedy) to match the largest possible string.
# It chooses the largest possible (of the overlaping) string
# Anytime it has option, it chooses the largest one.

import re

x = 'From: Using the : character'
# ".+" is greedy: it stretches to the LAST ":" on the line.
greedy_pattern = re.compile('^F.+:')
y = greedy_pattern.findall(x)
print(y)                    # ['From: Using the :']

# Why not 'From:'?
# Explaination:
#   ^F  - First character in the match is an F
#   :   - Last character in the match is a :
#   .+  - One or more characters

# ============================================
# ============================================

## Non-Greedy MAtching

# if you add a "?" character, the "+" and "*" chill out a bit;
    # adding ? means, it prefers the shorter of the strings.

import re

x = 'From: Using the : character'
# ".+?" is non-greedy: it stops at the FIRST ":" it can reach.
lazy_pattern = re.compile('^F.+?:')
y = lazy_pattern.findall(x)
print(y)                    # ['From:']

# Why this time only 'From:'?
# Explaination:
    #   ^F  - First character in the match is an F
    #   :   - Last character in the match is a :
    #   .+?  - One or more characters "but" not greedy

# --------------------------------
# Greedy is the default
# Non-Greedy is the optional.
# --------------------------------
# ============================================

# Fine-Tuning String Extraction #

# you can refine the match for "re.findall()" and separately determine which portion of the match -
# is to be extracted by using parenthesis "()".

# Though we have done this with other techniques as well.

import re

x = 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'

# Patterns are written as raw strings (r'...') so that "\S" reaches the regex
# engine untouched; in a plain string it is an invalid escape sequence that
# newer Python versions warn about.
y = re.findall(r'\S+@\S+', x)
print(y)                    # ['stephen.marquard@uct.ac.za']

# Without "+" (i.e. one or more characters) each side of the "@" is limited to one character:
y = re.findall(r'\S@\S', x)
print(y)                    # ['d@u']

# --------------------------------
# "+" on the left side only:
y = re.findall(r'\S+@\S', x)
print(y)                    # ['stephen.marquard@u']

# "+" on the right side only:
y = re.findall(r'\S@\S+', x)
print(y)                    # ['d@uct.ac.za']

# --------------------------------
# One character after the "@":
y = re.findall(r'@\S', x)
print(y)                    # ['@u']

# One character before the "@":
y = re.findall(r'\S@', x)
print(y)                    # ['d@']


# ---------------------------
# ---------------------------

#======================================

# Fine-Tuning String Extraction #
# Parentheses "()" are not part of the match
    # - but they tell where to "start" and "stop", what string to extract.

# We can give it a matching string that is different from the extracting string by adding paranthesis "()".

import re

x = 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'

# Raw strings (r'...') keep "\S" intact for the regex engine.

# Without (): the whole match is returned.
y = re.findall(r'From \S+@\S+', x)
print(y)                        # ['From stephen.marquard@uct.ac.za']

# Again without (), but anchored to the start of the line:
y = re.findall(r'^From \S+@\S+', x)
print(y)                        # ['From stephen.marquard@uct.ac.za']

# With (): the whole pattern must match, but only the part inside () is extracted.
y = re.findall(r'^From (\S+@\S+)', x)
print(y)                        # ['stephen.marquard@uct.ac.za']


# ============================================
# ============================================
# ============================================

# String Parsing Examples # -- Starts @ 6:14:01

# Now we are going to know couple of ways to use these new found skills.

## Regular Expressions: String Parcing @ 6:14:08

# some more practical applications or regular expression:

# 1: extracting a host name - using find and string slicing:

data = 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'

# Locate the "@", then the first blank after it; the host name is the text
# strictly between those two positions.
atpos = data.find('@')
sppos = data.find(' ', atpos)
host = data[atpos + 1:sppos]
for value in (atpos, sppos, host):
    print(value)                    # 21, then 31, then uct.ac.za

# ----------------------------

# Now we are going to know couple of ways to use these new found skills.

## Regular Expressions: String Parcing @ 6:14:08

# Double Split Pattern -- @nd method


line = 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'

# First split on whitespace to isolate the address (second word),
# then split the address on "@" to separate user and host.
words = line.split()
email = words[1]
pieces = email.split('@')
# Printed one per line:
#   ['From', 'stephen.marquard@uct.ac.za', 'Sat', 'Jan', '5', '09:14:16', '2008']
#   ['stephen.marquard', 'uct.ac.za']
#   uct.ac.za
print(words, pieces, pieces[-1], sep='\n')

# ----------------------------

# Regex Expression

import re

lin = 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'

# Raw strings (r'...') keep "\S" intact for the regex engine.
y = re.findall(r'@\S+', lin)
print(y)                        # ['@uct.ac.za']    -- but this includes the "@".

# A better solution for this requirement uses parentheses "()" so that
# only the part after the "@" is extracted:
y = re.findall(r'@(\S+)', lin)
print(y)                        # ['uct.ac.za']

# OR
y = re.findall(r'@([^ ]*)', lin)
print(y)                        # ['uct.ac.za']

# Explanation:
#   [^ ]  --  match a non-blank character
#   *     --  match zero or more of them

# Fine tuning: the line must also start with "From ".
y = re.findall(r'^From .*@([^ ]*)', lin)
print(y)                        # ['uct.ac.za']

        # Keep check on spacing between characters.
# ============================================

# Spam Confidence #
import re

numlist = list()
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        stuff = re.findall(r'^X-DSPAM-Confidence: ([0-9.]+)', line)

        # Explanation:
        #   If the line does not have this shape -- e.g. the prefix is missing
        #   or no number such as "0.8475" follows it -- findall() returns
        #   an empty list "[]".
        #   So the first thing to do is check whether we got a match;
        #   the "if" below skips every line that did not yield exactly one number.
        if len(stuff) != 1:
            continue
        # print(stuff)                  # commented out.
        num = float(stuff[0])
        numlist.append(num)

# print(numlist)                      # [0.8475, 0.6178, 0.6961, 0.7565, 0.7626, 0.7556, 0.7002, 0.7615, 0.7601, 0.7605, 0.6959, 0.7606, 0.7559, 0.7605, 0.6932, 0.7558, 0.6526, 0.6948, 0.6528, 0.7002, 0.7554, 0.6956, 0.6959, 0.7556, 0.9846, 0.8509, 0.9907]

if numlist:
    print('Maximum: ', max(numlist))    # Maximum:  0.9907
else:
    # max() raises ValueError on an empty list, so report the situation instead.
    print('No X-DSPAM-Confidence lines found')

# Clean code of above example:

# Spam Confidence #
import re

# Collect every X-DSPAM-Confidence value in 'emaildata' and report the largest.
numlist = list()
with open('emaildata') as hand:     # "with" closes the file even if the loop raises
    for line in hand:
        line = line.rstrip()
        stuff = re.findall(r'^X-DSPAM-Confidence: ([0-9.]+)', line)
        if len(stuff) != 1:         # skip lines that do not yield exactly one number
            continue
        num = float(stuff[0])
        numlist.append(num)

if numlist:
    print('Maximum: ', max(numlist))    # Maximum:  0.9907
else:
    # max() raises ValueError on an empty list, so report the situation instead.
    print('No X-DSPAM-Confidence lines found')

# ============================================
# Escape Character #

# if you want a special regular expression character (e.g $ -- MAtches the end of the line) to just behave normally
    # (most of the time) you prefix it with "\" -- (that is back slash).

import re

x = 'We just received $ 10.00 for cookies'
# The raw string keeps the backslash, so the regex engine sees "\$"
# (a literal dollar sign) rather than "$" (end of line).
y = re.findall(r'\$ [0-9.]+', x)
print(y)                                # ['$ 10.00']

# where:
    #   \$          a real dollar sign
    #   [0-9.]      a digit or period
    #   +           at least one or more


pak = 'We just received 10.00 pkr for cookies'
# Search the new sentence ("pak"); the original searched "x" again by mistake,
# which only happened to give the same answer.
y = re.findall(r'[0-9.]+', pak)
print(y)                                # ['10.00']

# ============================================
        ### Summary ###

    # Regular expressions are a cryptic but powerful language for
     # matching strings and extracting elements from those strings.

    # Regular expressions have special characters that indicate intent.



# ============================================
# ============================================
# ============================================





#ToDo: resume lecture from 6:22:32

Search This Blog

Lecture Reference

Regular Expressions / Regex / Regexp

Comments

Post a Comment