Lunarpedia:Autostub3/Source
Source code as of the partially successful execution that stopped after 327 entries.
#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#
#                                                 #
#  PreAutostub 3                                  #
#                                                 #
#  A specialized tool to feed a specific group of #
#  HTML sources into Autostub3                    #
#                                                 #
#  PUBLIC DOMAIN                                  #
#                                                 #
#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#

TESTNO = "Gamma_A"  # test run label; becomes part of the output file name

# Raw HTML inputs, concatenated in order.  The B and C parts
# ('/home/Luna/Raw_SP-7_B.txt_', '/home/Luna/Raw_SP-7_C.txt_') were disabled
# in the run this source was captured from.
RAW_SOURCES = ('/home/Luna/Raw_SP-7_A.txt',)

# Timestamp stamped on every generated revision.
REVISION_DATE = "2007-03-07T18:00:00Z"

# Tags that emit a fixed piece of wikitext and drop back to Text mode.
_TAG_OUTPUT = {
    "I": "''",
    "/I": "''",
    "B": "'''",
    "/B": "'''",
    "CITE": "<ref>",
    "/CITE": "</ref>",
    "/A": "",
    "P": "<BR/>",
    "/MATH": "</math>",
    "DD": "<BR/>",
}


def Parser(List):
    """Convert one HTML glossary entry into wikitext.

    A character-at-a-time state machine with five modes:

    Text   -- copy characters to the output; '<' switches to Tag mode.
    Tag    -- collect the tag text up to '>', then act on the tag name:
                A                 enter A mode
                STRONG            enter Strong mode (first occurrence only)
                I, /I             output "''"
                B, /B             output "'''"
                CITE, /CITE       output '<ref>' / '</ref>' (CITE sets flag R)
                MATH, /MATH       output the lower-cased tag / '</math>'
                P, DD             output '<BR/>'
                /A                output nothing
                IMG               output a missing-image marker, set flag I
                anything else     output the tag verbatim in angle brackets
    A      -- collect link text up to '<', then output a piped wikilink.
    Strong -- collect the entry name up to '<', then capitalize it.
    End    -- discard everything up to and including the next '>'.
              Nested tags are not supported; any tag is taken to be the
              matching end tag.

    List -- the entry's HTML as a string (any iterable of characters).

    Returns [Output, Name, Flag]: the wikitext body, the entry name, and a
    string of flag letters ("R" needs a references section, "I" has a
    missing image).  Raises TypeError if the machine ends up in an unknown
    mode.
    """
    TagID = ""      # text of the tag currently being read
    LinkID = ""     # link text currently being read
    Name = ""       # entry name taken from the first <STRONG>
    Output = []     # output fragments, joined on return
    Flag = ""
    Named = 0       # set once the first <STRONG> has been consumed
    mode = "Text"

    for q in List:
        if mode == "Text":
            if q == "<":
                mode = "Tag"
                TagID = ""
            else:
                Output.append(q)

        elif mode == "Tag":
            if q != ">":
                TagID += q
                continue
            tag = ReadTag(TagID)
            mode = "Text"
            if tag == "A":
                mode = "A"
            elif tag == "STRONG":
                if not Named:
                    mode = "Strong"
                else:
                    # NOTE(review): a second <STRONG> emits a lone "'''" and
                    # End mode then discards its text -- looks unintended,
                    # kept as found; confirm against the raw input.
                    mode = "End"
                    Output.append("'''")
            elif tag in _TAG_OUTPUT:
                Output.append(_TAG_OUTPUT[tag])
                if tag == "CITE":
                    Flag += "R"
            elif tag == "MATH":
                Output.append("<" + TagID.lower() + ">")
            elif tag == "IMG":
                Output.append("'''Missing Image:" + TagID + "'''")
                Flag += "I"
            else:
                Output.append("<" + TagID + ">")

        elif mode == "A":
            if q == "<":
                if LinkID != "":
                    Output.append("[[" + CapIt(LinkID) + "|" + LinkID + "]]")
                # Leave A mode even for an empty anchor; otherwise the
                # closing tag would be swallowed into the next link text.
                mode = "End"
                LinkID = ""
            else:
                LinkID += q

        elif mode == "Strong":
            if q == "<":
                Name = CapIt(Name)
                mode = "End"
                Named = 1
            else:
                Name += q

        elif mode == "End":
            if q == ">":
                mode = "Text"

        else:
            raise TypeError("Parser internal error: WTF is " + mode + " mode??")

    return ["".join(Output), Name, Flag]


def KingMe(stringy):
    '''Capitalize the first character without lower-casing the rest.

    An empty string is returned unchanged.
    '''
    return stringy[:1].upper() + stringy[1:]


def ReadTag(rawtext):
    """Return the tag name: the text before the first space, upper-cased,
    with backslashes, periods and spaces stripped from both ends."""
    return rawtext.split(' ', 1)[0].upper().strip('\\. ')


def CapIt(Name):
    """Return Name with every word capitalized and single-spaced.

    Commas and periods are stripped from both ends of each word; words that
    are empty after stripping are dropped.
    """
    words = [word.strip(',.') for word in Name.split()]
    return " ".join(KingMe(word) for word in words if word)


def FindRedir(gunk):
    """Return the target of the longest [[wikilink]] in gunk, or None.

    The target is the text between '[[' and the first '|' or ']'.  With one
    link its target is returned as is; with several, the first of the
    longest targets is returned (None if they are all empty).
    """
    link = []
    buffer = ""
    mode = "ignore"
    for q in gunk:
        if mode == "ignore":
            if q == "[":
                mode = "maybe"
        elif mode == "maybe":
            mode = "link" if q == "[" else "ignore"
        elif mode == "link":
            if q == "|" or q == "]":
                link.append(buffer)
                buffer = ""
                mode = "ignore"
            else:
                buffer += q

    if not link:
        return None
    if len(link) == 1:
        return link[0]
    # max() returns the first maximal element, matching a strict '>' scan.
    return max(link, key=len) or None


# Name patches for entries that turn out to be redirects.
REDIRECT_RENAMES = {
    "ADF (abbr)": "ADF",
    "ADP (abbr)": "ADP",
    "AFC (abbr)": "AFC",
    "Andromeda (abbr And Andr)": "Andromeda",
    "Antlia (abbr Ant Antl.)": "Antlia",
    "APU (abbr)": "APU",
    "AU (abbr)": "AU",
    "Atomic Mass Unit (abbr Amu)": "Atomic Mass Unit",
    "Atomic Weight Unit (abbr Awu)": "Atomic Weight Unit",
    "Anti-g Suit": "Anti G Suit",
}

# Name patches for ordinary articles.
# NOTE(review): the two keys containing U+FFFD look like mis-encoded
# accented names from the raw input; confirm against the input bytes.
ARTICLE_RENAMES = {
    "Acoustic Velocity (": "Acoustic Velocity",
    "(abbr ADC)": "ADC",
    "Angular Acceleration (": "Angular Acceleration",
    "Angular Velocity (": "Angular Velocity",
    "Astronomical Unit (abbr AU)": "Astronomical Unit",
    "Atomic Weight (abbr At Wt.)": "Atomic Weight",
    "Automatic Direction Finder (abbr ADF)": "Automatic Direction Finder",
    "Automatic Frequency Control (abbr AFC)": "Automatic Frequency Control",
    "Automatic Gain Control (abbr AGC)": "Automatic Gain Control",
    "Auxiliary Power Unit (abbr APU)": "Auxiliary Power Unit",
    "Axis (plural Axes)": "Axis",
    "Acceleration Of Gravity (": "Acceleration Of Gravity",
    "Air Position Indicator (abbr API)": "Air Position Indicator",
    "Alphanumeric (alphabet Plus Numeric)": "Alphanumeric",
    "Ampere (abbr A)": "Ampere",
    "AND Gate And Gate": "AND Gate",
    "Anti-matter": "Antimatter",
    "Anti-particle": "Antiparticle",
    "Alga (plural Algae)": "Algae",
    "B��y Chair": "Bárány Chair",
    "Barn (Abbr B)": "Barn",
    "(abbr BT Sequencing)": "BT Sequencing",
    "Baum�scale (abbr Be)": "Baumé scale",
    "Bernoulli Law Or Bernoulli Theorem": "Bernoulli's Law",
    "Bluntness (": "Bluntness",
    "Boiling Point (abbr Bp)": "Boiling Point",
    "Bohr Magneton Electronic Bohr Magneton": "Bohr Magneton",
    "Boltzmann Constant (symbol": "Boltzmann Constant",
    "Btu (abbr)": "BTU",
    "Calorie (abbr Cal)": "Calorie",
    "Carrier Wave (abbr Cw)": "Carrier Wave",
    "Cathode-ray Tube (abbr CRT)": "Cathode Ray Tube",
    "Cathode-ray Oscilloscope": "Cathode Ray Oscilloscope",
    "Celsius Temperature Scale (abbr C)": "Celsius Temperature Scale",
    "Centigrade Temperature Scale (abbr C)": "Centigrade Temperature Scale",
    "Centimeter (abbr Cm)": "Centimeter",
    "Centimeter-gram-second System (abbr Cgs)": "Centimeter-gram-second System",
    "Centipoise (abbr Cp)": "Centipoise",
    "Cermet [ceramic + Metal]": "Cermet",
    "Circle Of Equal Probability (abbr CEP)": "Circle Of Equal Probability",
    "Circular Dispersion (abbr CD)": "Circular Dispersion",
    "Co": "Co (prefix)",
    "Coefficient (abbr Coeff)": "Coefficient",
    "Coherent Oscillator (abbr Coho)": "Coherent Oscillator",
    "Comes (plural Comites)": "Comes",
    "Continuous Waves (abbr CW)": "Continuous Waves",
    "Coolant (": "Coolant",
    "Correlation Tracking And Ranging (abbr Cotar)": "Correlation Tracking And Ranging",
    "Correlation Tracking And Triangulation (abbr Cotat)": "Correlation Tracking And Triangulation",
    "Coulomb (abbr C)": "Coulomb",
    # These two are patched to an empty name; BuildEntries drops any entry
    # whose final name is empty.
    "Corv": "",
    "Corvus": "",
    "Curie (abbr C)": "Curie",
    "Caelum (abbr Cae Cael)": "Caelum",
    "Bit Rate": "Bitrate",
    "Black Body Blackbody": "Black Body",
    "Black-body Radiation": "Black Body Radiation",
    "Body Of Revolution": "Body of Revolution",
}

# Entry names that are dropped instead of becoming articles ("bailouts").
ARTICLE_SKIPS = frozenset([
    "AND-NOT Gate = Exclusive OR Circuit",
    "[[]]",
    "",
    "Cae Cael",
    "Caml",
    "Cam Caml",
    "Cap Capr",
    "Car Cari",
    "Cet Ceti",
    "Cha Cham",
    "Cir Circ",
    "CMa C Maj",
    "CMi C Min",
    "Cnc Canc",
    "Colm",
    "Com Coma",
    "Copy",
    "Cor A",
    "Cor B",
    "CrA Cor A",
    "CRT (abbr)",
    "CrB Cor B",
    "Crt Crat",
    "Cruc",
    "Crv Corv",
    "Cvn C Ven",
    "Cyg Cygn",
    "Aps Apus",
    "Aql Aqil",
    "Ari Arie",
    "Aur Auri",
    "Ares",
])


def BuildEntries(RawML):
    """Split raw HTML on '<DT>' and sort the entries into articles and
    redirects.

    For each non-empty slice, Parser produces [text, name, flags]; then:

    * text containing "constellation" -- the entry is reported and dropped;
    * text with '=' in its first 5 characters or 'SEE' (any case) in its
      first 20 -- the entry is a redirect to the link FindRedir picks; it
      is dropped if no link is found;
    * anything else -- an article, unless its name is in ARTICLE_SKIPS.
      A kept article whose name contains a space also gets a redirect from
      the sentence-case form of its name.

    Names are patched through REDIRECT_RENAMES / ARTICLE_RENAMES, and
    entries left with an empty name are dropped.

    Returns (ProtoArticles, Redirects): a list of [text, name, flags] lists
    and a list of (redirect name, target name) tuples.
    """
    ProtoArticles = []
    Redirects = []
    for chunk in RawML.split('<DT>'):
        if chunk == "":
            continue
        artie = Parser(chunk)
        body = artie[0]

        if body.count("constellation"):
            print("constellation check removing entry: " + artie[1])

        elif body.count('=', 0, 5) or body.upper().count('SEE', 0, 20):
            target = FindRedir(body)
            if not target:
                continue
            artie[1] = REDIRECT_RENAMES.get(artie[1], artie[1])
            if artie[1]:
                Redirects.append((artie[1], target))

        else:
            if artie[1] in ARTICLE_SKIPS:
                continue
            artie[1] = ARTICLE_RENAMES.get(artie[1], artie[1])
            if not artie[1]:
                continue
            ProtoArticles.append(artie)

            # Redirect-Creation-O-Mat: "Acoustic velocity" -> "Acoustic
            # Velocity".  The target is stored as a plain name because
            # doredir adds the #REDIRECT markup; a name that is already in
            # sentence case would only redirect to itself.
            sentence_case = artie[1].capitalize()
            if artie[1].count(" ") and sentence_case != artie[1]:
                Redirects.append((sentence_case, artie[1]))

    return ProtoArticles, Redirects


#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#
#                               #
# Definition Autostub Generator #
#                               #
# Public Domain                 #
#                               #
#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#
#
# from PreAutostub3:
#
#   ProtoArticles (article text, article name, flags)
#   Redirects     (redirect name, redirect to this article)
#

_XML_ENTITIES = {
    "<": "&lt;",
    ">": "&gt;",
    "&": "&amp;",
    '"': "&quot;",
}


def XMLproof(textish):
    """Return textish with <, >, & and " replaced by XML entities."""
    return "".join(_XML_ENTITIES.get(q, q) for q in textish)


def StartXML():
    """Return the opening <mediawiki> element and <siteinfo> header as a
    (nested) list of strings for Stringify."""
    out = [['<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" '
            'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
            'xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ '
            'http://www.mediawiki.org/xml/export-0.3.xsd" '
            'version="0.3" xml:lang="en">\n']]
    out += [' <siteinfo>\n']
    out += [' <sitename>Lunarpedia</sitename>\n']
    out += [' </siteinfo>\n']
    return out


def EndXML():
    """Return the closing </mediawiki> tag as a nested list of strings."""
    out = [['</mediawiki>\n']]
    return out


def ArtXML(title, contrib, date, text):
    '''XML markup for one page in the export file.

    title   -- title of article (plain text; escaped here)
    contrib -- name of script (ie Autostub2)
    date    -- revision timestamp string
    text    -- the article, already XML-escaped; a string or list of strings

    Returns a (nested) list of strings for Stringify.
    '''
    out = [[' <page>\n']]
    out += [' <title>' + XMLproof(title) + '</title>\n']
    out += [' <revision>\n']
    out += [' <timestamp>' + date + '</timestamp>\n']
    out += [' <contributor>\n']
    out += [' <username>' + contrib + '</username>\n']
    out += [' </contributor>\n']
    out += [' <text xml:space="preserve">']
    out += text
    out += ['</text>\n']
    out += [' </revision>\n']
    out += [' </page>\n']
    return out


def linebrk(listish):
    """Return a new list with a newline appended to every string."""
    return [q + "\n" for q in listish]


def Stringify(listish):
    """Flatten an arbitrarily nested list of strings into one string.

    Raises TypeError for an item that is neither a string nor a list.
    """
    pieces = []
    for q in listish:
        if isinstance(q, str):
            pieces.append(q)
        elif isinstance(q, list):
            pieces.append(Stringify(q))
        else:
            print(type(q))
            print(q)
            raise TypeError("non-string non-list!!!")
    return "".join(pieces)


def dodef(thisentry):
    """Create stub article from sequence

    thisentry[0] -- body of definition article
    thisentry[1] -- name of article
    thisentry[2] -- flags
                      I: missing image
                      R: needs references section

    Returns the article as a list of newline-terminated wikitext lines.
    """
    to_out = []
    to_out += ["{{Autostub}}"]
    to_out += ["{{Initial Proof Needed}}"]
    to_out += ["'''" + thisentry[1] + "'''"]
    to_out += [thisentry[0]]
    to_out += ["==References=="]
    to_out += ["''This article is based on NASA's [[NASA SP-7|Dictionary of Technical Terms for Aerospace Use]]''"]
    if thisentry[2].count("R"):
        to_out += ["<references/>"]
    to_out += ["[[Category%3ADefinitions]]"]
    to_out += ["[[Category%3ANASA SP-7]]"]
    if thisentry[2].count("I"):
        to_out += ["[[Category%3ADefinitions with Missing Images]]"]
    return linebrk(to_out)


def doredir(thisentry):
    """Create redirect from two item sequence

    thisentry[0] -- name of redirect
    thisentry[1] -- redirect to this article (plain name, no markup)

    Returns the redirect page text as a one-item list.
    """
    return ["#REDIRECT: [[" + thisentry[1] + "]]"]


def doare(tup):
    """Return the export XML for one (redirect name, target) entry."""
    proofed = [XMLproof(q) for q in doredir(tup)]
    return ArtXML(tup[0], "Autostub3", REVISION_DATE, proofed)


def doanart(tup):
    """Return the export XML for one [text, name, flags] article entry."""
    proofed = [XMLproof(q) for q in dodef(tup)]
    return ArtXML(tup[1], "Autostub3", REVISION_DATE, proofed)


def BuildXML(ProtoArticles, Redirects):
    """Return the complete export document as one string: header, every
    article, every redirect, footer."""
    execute = StartXML()
    for q in ProtoArticles:
        execute += doanart(q)
    for q in Redirects:
        execute += doare(q)
    execute += EndXML()
    return Stringify(execute)


if __name__ == "__main__":
    # Get raw html
    RawML = ""
    for path in RAW_SOURCES:
        with open(path) as tsv:
            RawML += tsv.read()

    # Create ProtoArticles and Redirects
    ProtoArticles, Redirects = BuildEntries(RawML)
    print("%d articles" % len(ProtoArticles))
    print("%d redirects" % len(Redirects))

    # Write the import file
    execute = BuildXML(ProtoArticles, Redirects)
    with open("/home/Luna/autostub3test" + TESTNO + ".xml", 'w') as do_xml:
        do_xml.write(execute)